/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/3643/include/ck/library/utility/host_tensor.hpp Source File

/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/3643/include/ck/library/utility/host_tensor.hpp Source File#

Composable Kernel: /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/3643/include/ck/library/utility/host_tensor.hpp Source File
host_tensor.hpp
Go to the documentation of this file.
1 // Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
2 // SPDX-License-Identifier: MIT
3 
4 #pragma once
5 
6 #include <algorithm>
7 #include <cassert>
8 #include <iostream>
9 #include <fstream>
10 #include <numeric>
11 #include <random>
12 #include <thread>
13 #include <utility>
14 #include <vector>
15 
16 #include "ck/utility/data_type.hpp"
17 #include "ck/utility/span.hpp"
19 
23 
25 
26 namespace ck {
27 
// Stream every element of `range` to `os`, separated by `delim`.
// Returns `os` so further stream output can be chained.
template <typename Range>
std::ostream& LogRange(std::ostream& os, Range&& range, std::string delim)
{
    const char* separator = "";
    for(auto&& value : range)
    {
        os << separator << value;
        separator = delim.c_str();
    }
    return os;
}
42 
43 template <typename T, typename Range>
44 std::ostream& LogRangeAsType(std::ostream& os, Range&& range, std::string delim)
45 {
46  bool first = true;
47  for(auto&& v : range)
48  {
49  if(first)
50  first = false;
51  else
52  os << delim;
53 
54  using RangeType = ck::remove_cvref_t<decltype(v)>;
55  if constexpr(std::is_same_v<RangeType, ck::f8_t> || std::is_same_v<RangeType, ck::bf8_t> ||
56  std::is_same_v<RangeType, ck::bhalf_t>)
57  {
58  os << ck::type_convert<float>(v);
59  }
60  else if constexpr(std::is_same_v<RangeType, ck::pk_i4_t> ||
61  std::is_same_v<RangeType, ck::f4x2_pk_t>)
62  {
63  const auto packed_floats = ck::type_convert<ck::float2_t>(v);
64  const ck::vector_type<float, 2> vector_of_floats{packed_floats};
65  os << vector_of_floats.template AsType<float>()[ck::Number<0>{}] << delim
66  << vector_of_floats.template AsType<float>()[ck::Number<1>{}];
67  }
68  else
69  {
70  os << static_cast<T>(v);
71  }
72  }
73  return os;
74 }
75 
// Invoke f(args...) where `args` is a tuple, expanding its elements with
// the supplied index sequence (implementation detail of
// call_f_unpack_args).
template <typename F, typename T, std::size_t... Is>
auto call_f_unpack_args_impl(F f, T args, std::index_sequence<Is...>)
{
    return f(std::get<Is>(args)...);
}

// Call `f` with the elements of tuple `args` passed as individual
// arguments; returns whatever `f` returns.
template <typename F, typename T>
auto call_f_unpack_args(F f, T args)
{
    constexpr std::size_t N = std::tuple_size<T>{};

    return call_f_unpack_args_impl(f, args, std::make_index_sequence<N>{});
}

// Construct an F from the tuple elements of `args` (implementation detail
// of construct_f_unpack_args).
template <typename F, typename T, std::size_t... Is>
auto construct_f_unpack_args_impl(T args, std::index_sequence<Is...>)
{
    return F(std::get<Is>(args)...);
}

// Construct an F from the elements of tuple `args`. The first (unnamed)
// parameter exists only so F can be deduced from an argument; its value
// is ignored.
template <typename F, typename T>
auto construct_f_unpack_args(F, T args)
{
    constexpr std::size_t N = std::tuple_size<T>{};

    return construct_f_unpack_args_impl<F>(args, std::make_index_sequence<N>{});
}
103 
173 {
176 
177  // Runtime tag describing which layout is picked when layout is not specified explicitly at
178  // construction time.
179  enum class ChosenLayout
180  {
181  Original,
182  RowMajor,
184  };
185 
186  // Master constructor
// Master constructor: every other constructor delegates here. Takes
// ownership of the length/stride vectors, resolves the effective layout
// (legacy callers may omit it — see HandleDefaultLayout), then derives
// any missing strides and validates them for the chosen layout.
template <typename Layout>
HostTensorDescriptor(std::vector<std::size_t> lens,
                     std::vector<std::size_t> strides,
                     const Layout& layout = DefaultLayout())
    : mLens(std::move(lens)), mStrides(std::move(strides))
{
    // To support legacy use cases, when layout is not passed in
    const auto new_layout = HandleDefaultLayout(layout);
    if(dbg)
    {
        std::cout << "Original Lens: [";
        LogRange(std::cout, mLens, ", ") << "] and Strides: [";
        LogRange(std::cout, mStrides, ", ") << "]" << std::endl;
        std::cout << "Layout: " << layout << " --> " << new_layout << std::endl;
    }

    // Handling the strides and validation based on the chosen layout
    DispatchChosenLayout(new_layout, layout, [&](auto selected_layout) {
        this->CalculateStrides(selected_layout);
        this->ValidateStrides(selected_layout);
    });
}
209 
211 
212  // Helper that invokes a callable with a concrete layout object whose type
213  // matches the chosen tag (so template code depending on the layout type
214  // can still leverage if constexpr branches).
215  template <typename F, typename OrigLayout>
216  void DispatchChosenLayout(ChosenLayout tag, const OrigLayout& orig, F&& f) const
217  {
218  switch(tag)
219  {
223  default: f(orig); break;
224  }
225  }
226 
227  template <typename Layout>
229  {
230  if constexpr(!std::is_same_v<Layout, DefaultLayout>)
231  {
232  return ChosenLayout::Original;
233  }
234  else
235  {
236  if(mStrides.empty())
237  {
238  // No strides provided -> assume RowMajor
239  return ChosenLayout::RowMajor;
240  }
241 
242  const auto rank = mLens.size();
243 
244  if(rank > 2)
245  {
246  // Keep as-is - validation will warn/throw later
247  return ChosenLayout::Original;
248  }
249 
250  if(rank == 0)
251  {
252  // Keep as-is - validation will warn/throw later
253  return ChosenLayout::Original;
254  }
255 
256  if(rank == 1)
257  {
258  // Treat 1D tensor as RowMajor
259  return ChosenLayout::RowMajor;
260  }
261 
262  // rank == 2
263  if(mStrides.size() == 2)
264  {
265  // RowMajor pattern (?, 1)
266  if(mStrides[1] == 1)
267  {
268  return ChosenLayout::RowMajor;
269  }
270 
271  // ColumnMajor pattern (1, ?)
272  if(mStrides[0] == 1)
273  {
275  }
276  }
277 
278  // Fallback: leave as-is
279  return ChosenLayout::Original;
280  }
281  }
282 
283  template <typename Layout>
285  {
286  if constexpr(std::is_same_v<Layout, ck::tensor_layout::BypassLayoutVerification>)
287  return;
288  // This is a workaround if the original stride value is -1 (which means "unknown") has been
289  // passed in and casted to size_t (unsigned).
290  auto strides_int = AsInt(mStrides);
291 
292  // case of empty strides or all-zero: auto-calculate based on layout and tensor dimensions
293  if(mStrides.empty() || std::all_of(strides_int.begin(), strides_int.end(), [](int stride) {
294  return stride <= 0;
295  }))
296  {
297 
298  if constexpr(!(std::is_same_v<ck::tensor_layout::gemm::RowMajor, Layout> ||
299  std::is_same_v<ck::tensor_layout::gemm::ColumnMajor, Layout>))
300  {
301  if(dbg)
302  {
303  std::cerr << "Only RowMajor and ColumnMajor layouts are supported for empty "
304  "strides, got "
305  << layout << ". Will calculate strides as RowMajor." << std::endl;
306  }
307  }
308 
309  mStrides.clear();
310  mStrides.resize(mLens.size(), 0);
311  if(mStrides.empty())
312  return;
313 
314  mStrides.back() = 1;
315  std::partial_sum(mLens.rbegin(),
316  mLens.rend() - 1,
317  mStrides.rbegin() + 1,
318  std::multiplies<std::size_t>());
319 
320  if constexpr(std::is_same_v<ck::tensor_layout::gemm::ColumnMajor, Layout>)
321  {
322  // swap the last two strides
323  if(mStrides.size() >= 2)
324  std::swap(mStrides[mStrides.size() - 1], mStrides[mStrides.size() - 2]);
325  }
326  }
327  // The other case is if one of the strides is unknown
328  // Currently, only GEMM RowMajor and ColumnMajor layouts are supported and only in the lower
329  // two dimensions, e.g. {..., 0, N} or {..., M, 0}. The higher dimensions are left
330  // untouched.
331  else if constexpr(std::is_same_v<ck::tensor_layout::gemm::RowMajor, Layout> ||
332  std::is_same_v<ck::tensor_layout::gemm::ColumnMajor, Layout>)
333  {
334  auto rank = mStrides.size();
335  if(mLens.size() >= 2 && rank >= 2)
336  {
337  const auto inner_idx =
338  std::is_same_v<ck::tensor_layout::gemm::RowMajor, Layout> ? rank - 1 : rank - 2;
339  const auto outer_idx = inner_idx == rank - 1 ? rank - 2 : rank - 1;
340  if(mStrides[inner_idx] <= 0)
341  {
342  mStrides[inner_idx] = 1;
343  }
344  if(mStrides[outer_idx] <= 0)
345  {
346  mStrides[outer_idx] = mLens[inner_idx] * mStrides[inner_idx];
347  }
348  }
349  }
350  }
351 
352  template <typename Layout>
353  void ValidateStrides(const Layout& layout) const
354  {
355  if constexpr(std::is_same_v<ck::tensor_layout::BypassLayoutVerification, Layout>)
356  {
357  return;
358  }
359 
360  if(mLens.empty())
361  {
362  throw std::runtime_error(
363  "HostTensorDescriptor::ValidateStrides: empty tensor dimensions is not allowed.");
364  }
365 
366  const int rank = mLens.size();
367  if(rank == 1) // skip any 1D tensors
368  {
369  return;
370  }
371 
372  if constexpr(std::is_same_v<ck::tensor_layout::BaseTensorLayout, Layout>)
373  {
374  // Any legacy code that doesn't pass layout to HostTensorDescriptor ctor will
375  // hit this case (unless it is a special case - see `HandleDefaultLayout`).
376  throw std::runtime_error("HostTensorDescriptor::ValidateStrides: Abstract tensor "
377  "layout BaseTensorLayout can't be verified. Pls "
378  "pass specific tensor layout to HostTensorDescriptor (or "
379  "ck::tensor_layout::BypassLayoutVerification)");
380  }
381 
382  // GEMM cases
383  if constexpr(std::is_base_of_v<ck::tensor_layout::gemm::BaseGemmLayout, Layout>)
384  {
385  if(mLens.size() != mStrides.size())
386  {
387  std::ostringstream oss;
388  oss << "HostTensorDescriptor::ValidateStrides: mismatch between tensor rank and "
389  "size of strides: "
390  << *this;
391  throw std::runtime_error(oss.str());
392  }
393 
394  // in GEMM, strides must be all positive or all zeros (auto-derived from tensor
395  // dimensions)
396  auto strides_int = AsInt(mStrides);
397  if(std::any_of(
398  strides_int.begin(), strides_int.end(), [](int stride) { return stride <= 0; }))
399  {
400  std::ostringstream oss;
401  oss << "Stride values must be positive or all-zeros (auto-derived from tensor "
402  "dimensions). Instead got ";
403  std::copy(
404  strides_int.begin(), strides_int.end(), std::ostream_iterator<int>(oss, " "));
405  throw std::runtime_error(oss.str());
406  }
407 
408  if constexpr(std::is_same_v<ck::tensor_layout::gemm::RowMajor, Layout> ||
409  std::is_same_v<ck::tensor_layout::gemm::ColumnMajor, Layout>)
410  {
411  // The logic here assumes the GEMM with tensor of more than 2 dims, will always have
412  // HW dimesnsions as the inner ones e.g. batched GEMM is either BHW or BWH
413  const auto inner_idx =
414  std::is_same_v<ck::tensor_layout::gemm::RowMajor, Layout> ? rank - 1 : rank - 2;
415  const auto outer_idx = inner_idx == rank - 1 ? rank - 2 : rank - 1;
416 
417  if(mStrides[outer_idx] < mLens[inner_idx] * mStrides[inner_idx])
418  {
419  std::ostringstream oss;
420  oss << "Invalid strides for " << layout << ": " << *this;
421  throw std::runtime_error(oss.str());
422  }
423 
424  // For higher dimensions, validate strides assuming RowMajor
425  for(int i = 1; i < rank - 2; ++i)
426  {
427  if(mStrides[i - 1] < mStrides[i] * mLens[i])
428  {
429  std::ostringstream oss;
430  oss << "Invalid strides for higher dimensions in " << layout << ": "
431  << *this;
432  throw std::runtime_error(oss.str());
433  }
434  }
435  }
436  else
437  {
438  std::ostringstream oss;
439  oss << "Error: Unsupported GEMM layout: " << layout;
440  throw std::runtime_error(oss.str());
441  }
442  }
443  // Convolution cases
445  Layout>)
446  {
447  // TBD: implement verification for Conv layouts
448  // For now, just print warning and return
449  if(dbg)
450  {
451 
452  std::cerr
453  << "Warning: Tensor layout verification for ck::tensor_layout::convolution "
454  "layouts is not supported yet. Skipping..."
455  << std::endl;
456  }
457  return;
458  }
459  else
460  {
461  std::ostringstream oss;
462  oss << "Error: Tensor layout verification for " << layout << " is not supported yet.";
463  throw std::runtime_error(oss.str());
464  }
465  }
466 
// Lengths from an initializer_list of a size_t-convertible type; strides
// are left empty and derived from the layout by the master constructor.
template <typename X,
          typename Layout = DefaultLayout,
          typename        = std::enable_if_t<std::is_convertible_v<X, std::size_t> &&
                                             std::is_convertible_v<Layout, BaseTensorLayout>>>
HostTensorDescriptor(const std::initializer_list<X>& lens, const Layout& layout = Layout{})
    : HostTensorDescriptor(std::vector<std::size_t>(lens.begin(), lens.end()), {}, layout)
{
    if(dbg)
        std::cout << "HostTensorDescriptor ctor (" << __LINE__ << ")" << std::endl;
}

// Lengths from an initializer_list of signed 64-bit indices; strides
// derived from the layout.
template <typename Layout = DefaultLayout,
          typename        = std::enable_if_t<std::is_convertible_v<Layout, BaseTensorLayout>>>
HostTensorDescriptor(const std::initializer_list<ck::long_index_t>& lens,
                     const Layout& layout = Layout{})
    : HostTensorDescriptor(std::vector<std::size_t>(lens.begin(), lens.end()), {}, layout)
{
    if(dbg)
        std::cout << "HostTensorDescriptor ctor (" << __LINE__ << ")" << std::endl;
}

// Lengths from any range of integer-like values; strides derived from
// the layout.
template <typename Lengths,
          typename Layout = DefaultLayout,
          typename        = std::enable_if_t<
              (std::is_convertible_v<ck::ranges::range_value_t<Lengths>, std::size_t> ||
               std::is_convertible_v<ck::ranges::range_value_t<Lengths>, ck::long_index_t>) &&
              std::is_convertible_v<Layout, BaseTensorLayout>>>
HostTensorDescriptor(const Lengths& lens, const Layout& layout = Layout{})
    : HostTensorDescriptor(std::vector<std::size_t>(lens.begin(), lens.end()), {}, layout)
{
    if(dbg)
        std::cout << "HostTensorDescriptor ctor (" << __LINE__ << ")" << std::endl;
}

// Lengths and explicit strides, both as initializer_lists of
// size_t-convertible types.
template <typename X,
          typename Y,
          typename = std::enable_if_t<std::is_convertible_v<X, std::size_t> &&
                                      std::is_convertible_v<Y, std::size_t>>,
          typename Layout = DefaultLayout>
HostTensorDescriptor(const std::initializer_list<X>& lens,
                     const std::initializer_list<Y>& strides,
                     const Layout& layout = Layout{})
    : HostTensorDescriptor(std::vector<std::size_t>(lens.begin(), lens.end()),
                           std::vector<std::size_t>(strides.begin(), strides.end()),
                           layout)
{
    if(dbg)
        std::cout << "HostTensorDescriptor ctor (" << __LINE__ << ")" << std::endl;
}

// HostTensorDescriptor({row, col}, {row_stride, col_stride})
// Lengths and strides as initializer_lists of signed 64-bit indices.
template <typename Layout = DefaultLayout>
HostTensorDescriptor(const std::initializer_list<ck::long_index_t>& lens,
                     const std::initializer_list<ck::long_index_t>& strides,
                     const Layout& layout = Layout{})
    : HostTensorDescriptor(std::vector<std::size_t>(lens.begin(), lens.end()),
                           std::vector<std::size_t>(strides.begin(), strides.end()),
                           layout)
{
    if(dbg)
        std::cout << "HostTensorDescriptor ctor (" << __LINE__ << ")" << std::endl;
}

// HostTensorDescriptor({row, col}, strides)
// initializer_list lengths combined with a generic range of strides.
template <typename Strides, typename Layout = DefaultLayout>
HostTensorDescriptor(const std::initializer_list<std::size_t>& lens,
                     const Strides& strides,
                     const Layout& layout = Layout{})
    : HostTensorDescriptor(std::vector<std::size_t>(lens.begin(), lens.end()),
                           std::vector<std::size_t>(strides.begin(), strides.end()),
                           layout)
{
    if(dbg)
        std::cout << "HostTensorDescriptor ctor (" << __LINE__ << ")" << std::endl;
}

// Generic ranges for both lengths and strides; element types must both
// be size_t-convertible or both long_index_t-convertible.
template <typename Lengths,
          typename Strides,
          typename Layout = DefaultLayout,
          typename        = std::enable_if_t<
              ((std::is_convertible_v<ck::ranges::range_value_t<Lengths>, std::size_t> &&
                std::is_convertible_v<ck::ranges::range_value_t<Strides>, std::size_t>) ||
               (std::is_convertible_v<ck::ranges::range_value_t<Lengths>, ck::long_index_t> &&
                std::is_convertible_v<ck::ranges::range_value_t<Strides>, ck::long_index_t>)) &&
              std::is_convertible_v<Layout, BaseTensorLayout>>>
HostTensorDescriptor(const Lengths& lens,
                     const Strides& strides,
                     const Layout& layout = Layout{})
    : HostTensorDescriptor(std::vector<std::size_t>(lens.begin(), lens.end()),
                           std::vector<std::size_t>(strides.begin(), strides.end()),
                           layout)
{
    if(dbg)
        std::cout << "HostTensorDescriptor ctor (" << __LINE__ << ")" << std::endl;
}
562 
563  std::size_t GetNumOfDimension() const;
564  std::size_t GetElementSize() const;
565  std::size_t GetElementSpaceSize() const;
566 
567  const std::vector<std::size_t>& GetLengths() const;
568  const std::vector<std::size_t>& GetStrides() const;
569 
// Flat (linear) offset for a multi-index given as individual integers:
// the dot product of the indices with the strides. The index count must
// match the tensor rank (checked by assert in debug builds only).
template <typename... Is>
std::size_t GetOffsetFromMultiIndex(Is... is) const
{
    assert(sizeof...(Is) == this->GetNumOfDimension());
    std::initializer_list<std::size_t> iss{static_cast<std::size_t>(is)...};
    return std::inner_product(iss.begin(), iss.end(), mStrides.begin(), std::size_t{0});
}

// Flat (linear) offset for a multi-index given as a vector. No rank
// check is performed here.
std::size_t GetOffsetFromMultiIndex(const std::vector<std::size_t>& iss) const
{
    return std::inner_product(iss.begin(), iss.end(), mStrides.begin(), std::size_t{0});
}
582 
583  friend std::ostream& operator<<(std::ostream& os, const HostTensorDescriptor& desc);
584  friend std::ostream& operator<<(std::ostream& os, ChosenLayout tag);
585 
586  private:
587  std::vector<std::size_t> mLens;
588  std::vector<std::size_t> mStrides;
589  static constexpr bool dbg = false;
590 
597  std::vector<int> AsInt(const std::vector<size_t>& vec) const
598  {
599  std::vector<int> strides_int(vec.size());
600  std::transform(vec.begin(), vec.end(), strides_int.begin(), [](std::size_t stride) {
601  return static_cast<int>(stride);
602  });
603  return strides_int;
604  }
605 };
606 
607 template <typename New2Old, typename NewLayout = HostTensorDescriptor::BaseTensorLayout>
608 HostTensorDescriptor
610  const New2Old& new2old,
611  const NewLayout& new_layout = NewLayout())
612 {
613  std::vector<std::size_t> new_lengths(a.GetNumOfDimension());
614  std::vector<std::size_t> new_strides(a.GetNumOfDimension());
615 
616  for(std::size_t i = 0; i < a.GetNumOfDimension(); i++)
617  {
618  new_lengths[i] = a.GetLengths()[new2old[i]];
619  new_strides[i] = a.GetStrides()[new2old[i]];
620  }
621 
622  return HostTensorDescriptor(new_lengths, new_strides, new_layout);
623 }
624 
625 struct joinable_thread : std::thread
626 {
627  template <typename... Xs>
628  joinable_thread(Xs&&... xs) : std::thread(std::forward<Xs>(xs)...)
629  {
630  }
631 
634 
636  {
637  if(this->joinable())
638  this->join();
639  }
640 };
641 
642 template <typename F, typename... Xs>
644 {
645  F mF;
646  static constexpr std::size_t NDIM = sizeof...(Xs);
647  std::array<std::size_t, NDIM> mLens;
648  std::array<std::size_t, NDIM> mStrides;
649  std::size_t mN1d;
650 
// Store the functor and extents, derive packed row-major strides via a
// reverse partial product, and compute the flattened problem size mN1d.
// NOTE(review): assumes NDIM >= 1 — mStrides.back() on an empty array
// would be UB; confirm no zero-argument instantiations exist.
ParallelTensorFunctor(F f, Xs... xs) : mF(f), mLens({static_cast<std::size_t>(xs)...})
{
    mStrides.back() = 1;
    std::partial_sum(mLens.rbegin(),
                     mLens.rend() - 1,
                     mStrides.rbegin() + 1,
                     std::multiplies<std::size_t>());
    mN1d = mStrides[0] * mLens[0];
}
660 
661  std::array<std::size_t, NDIM> GetNdIndices(std::size_t i) const
662  {
663  std::array<std::size_t, NDIM> indices;
664 
665  for(std::size_t idim = 0; idim < NDIM; ++idim)
666  {
667  indices[idim] = i / mStrides[idim];
668  i -= indices[idim] * mStrides[idim];
669  }
670 
671  return indices;
672  }
673 
674  void operator()(std::size_t num_thread = 1) const
675  {
676  std::size_t work_per_thread = (mN1d + num_thread - 1) / num_thread;
677 
678  std::vector<joinable_thread> threads(num_thread);
679 
680  for(std::size_t it = 0; it < num_thread; ++it)
681  {
682  std::size_t iw_begin = it * work_per_thread;
683  std::size_t iw_end = std::min((it + 1) * work_per_thread, mN1d);
684 
685  auto f = [=, *this] {
686  for(std::size_t iw = iw_begin; iw < iw_end; ++iw)
687  {
689  }
690  };
691  threads[it] = joinable_thread(f);
692  }
693  }
694 };
695 
// Convenience factory that deduces the functor and extent types for
// ParallelTensorFunctor.
template <typename F, typename... Xs>
auto make_ParallelTensorFunctor(F f, Xs... xs)
{
    return ParallelTensorFunctor<F, Xs...>(f, xs...);
}
701 
702 template <typename T>
703 struct Tensor
704 {
706  using Data = std::vector<T>;
707 
// Lengths only; strides are derived by the descriptor, and the data
// buffer is sized to the resulting element space.
template <typename X>
Tensor(std::initializer_list<X> lens) : mDesc(lens), mData(GetElementSpaceSize())
{
}

// Lengths and explicit strides as initializer_lists.
template <typename X, typename Y>
Tensor(std::initializer_list<X> lens, std::initializer_list<Y> strides)
    : mDesc(lens, strides), mData(GetElementSpaceSize())
{
}

// Lengths from any range type.
template <typename Lengths>
Tensor(const Lengths& lens) : mDesc(lens), mData(GetElementSpaceSize())
{
}

// Lengths and strides from any range types.
template <typename Lengths, typename Strides>
Tensor(const Lengths& lens, const Strides& strides)
    : mDesc(lens, strides), mData(GetElementSpaceSize())
{
}

// Lengths plus extra descriptor arguments (e.g. a layout tag) forwarded
// to HostTensorDescriptor.
template <typename X, typename... Rest, std::enable_if_t<(sizeof...(Rest) > 0), int> = 0>
Tensor(std::initializer_list<X> lens, Rest&&... rest)
    : mDesc(lens, std::forward<Rest>(rest)...), mData(GetElementSpaceSize())
{
}

// Lengths and strides plus extra descriptor arguments.
template <typename X,
          typename Y,
          typename... Rest,
          std::enable_if_t<(sizeof...(Rest) > 0), int> = 0>
Tensor(std::initializer_list<X> lens, std::initializer_list<Y> strides, Rest&&... rest)
    : mDesc(lens, strides, std::forward<Rest>(rest)...), mData(GetElementSpaceSize())
{
}

// Range of lengths plus extra descriptor arguments.
template <typename Lengths, typename... Rest, std::enable_if_t<(sizeof...(Rest) > 0), int> = 0>
Tensor(const Lengths& lens, Rest&&... rest)
    : mDesc(lens, std::forward<Rest>(rest)...), mData(GetElementSpaceSize())
{
}

// Ranges of lengths and strides plus extra descriptor arguments.
template <typename Lengths,
          typename Strides,
          typename... Rest,
          std::enable_if_t<(sizeof...(Rest) > 0), int> = 0>
Tensor(const Lengths& lens, const Strides& strides, Rest&&... rest)
    : mDesc(lens, strides, std::forward<Rest>(rest)...), mData(GetElementSpaceSize())
{
}

// Construct directly from a ready-made descriptor.
Tensor(const Descriptor& desc) : mDesc(desc), mData(GetElementSpaceSize()) {}
761 
762  template <typename OutT>
764  {
765  Tensor<OutT> ret(mDesc);
766 
768  mData, ret.mData.begin(), [](auto value) { return ck::type_convert<OutT>(value); });
769 
770  return ret;
771  }
772 
// A tensor always needs a descriptor, so default construction is
// disallowed; copy/move of descriptor + data vector are fine by default.
Tensor()              = delete;
Tensor(const Tensor&) = default;
Tensor(Tensor&&)      = default;

~Tensor() = default;

Tensor& operator=(const Tensor&) = default;
Tensor& operator=(Tensor&&)      = default;

// Converting copy-constructor: element-wise type conversion from FromT
// to T via CopyAsType.
template <typename FromT>
explicit Tensor(const Tensor<FromT>& other) : Tensor(other.template CopyAsType<T>())
{
}
786  void savetxt(std::string file_name, std::string dtype = "float")
787  {
788  std::ofstream file(file_name);
789 
790  if(file.is_open())
791  {
792  for(auto& itm : mData)
793  {
794  if(dtype == "float")
795  file << ck::type_convert<float>(itm) << std::endl;
796  else if(dtype == "int")
797  file << ck::type_convert<int>(itm) << std::endl;
798  else
799  // TODO: we didn't implement operator<< for all custom
800  // data types, here fall back to float in case compile error
801  file << ck::type_convert<float>(itm) << std::endl;
802  }
803  file.close();
804  }
805  else
806  {
807  // Print an error message to the standard error
808  // stream if the file cannot be opened.
809  throw std::runtime_error(std::string("unable to open file:") + file_name);
810  }
811  }
// Descriptor pass-throughs.
decltype(auto) GetLengths() const { return mDesc.GetLengths(); }

decltype(auto) GetStrides() const { return mDesc.GetStrides(); }

std::size_t GetNumOfDimension() const { return mDesc.GetNumOfDimension(); }

std::size_t GetElementSize() const { return mDesc.GetElementSize(); }
820  std::size_t GetElementSpaceSize() const
821  {
823  {
824  return (mDesc.GetElementSpaceSize() + 1) / ck::packed_size_v<ck::remove_cvref_t<T>>;
825  }
826  else
827  {
828  return mDesc.GetElementSpaceSize();
829  }
830  }
831 
// Size of the backing storage in bytes.
std::size_t GetElementSpaceSizeInBytes() const { return sizeof(T) * GetElementSpaceSize(); }

// Fill the whole buffer with zero-valued elements.
void SetZero() { ck::ranges::fill<T>(mData, T{0}); }
835 
// Recursive worker for ForEach: loops over dimension `rank`; once every
// dimension of `idx` is fixed, invokes f(*this, idx).
template <typename F>
void ForEach_impl(F&& f, std::vector<size_t>& idx, size_t rank)
{
    if(rank == mDesc.GetNumOfDimension())
    {
        f(*this, idx);
        return;
    }
    // else
    for(size_t i = 0; i < mDesc.GetLengths()[rank]; i++)
    {
        idx[rank] = i;
        ForEach_impl(std::forward<F>(f), idx, rank + 1);
    }
}

// Apply f(tensor, multi_index) to every multi-index of this tensor
// (mutable overload).
template <typename F>
void ForEach(F&& f)
{
    std::vector<size_t> idx(mDesc.GetNumOfDimension(), 0);
    ForEach_impl(std::forward<F>(f), idx, size_t(0));
}
858 
// Const-overload recursive worker.
// NOTE(review): `const F&&` (a const rvalue reference) is an unusual
// parameter type — it binds temporaries but prevents moving from f;
// presumably `const F&` was intended. Confirm before changing.
template <typename F>
void ForEach_impl(const F&& f, std::vector<size_t>& idx, size_t rank) const
{
    if(rank == mDesc.GetNumOfDimension())
    {
        f(*this, idx);
        return;
    }
    // else
    for(size_t i = 0; i < mDesc.GetLengths()[rank]; i++)
    {
        idx[rank] = i;
        ForEach_impl(std::forward<const F>(f), idx, rank + 1);
    }
}

// Apply f(tensor, multi_index) to every multi-index of this tensor
// (read-only overload).
template <typename F>
void ForEach(const F&& f) const
{
    std::vector<size_t> idx(mDesc.GetNumOfDimension(), 0);
    ForEach_impl(std::forward<const F>(f), idx, size_t(0));
}
881 
882  template <typename G>
883  void GenerateTensorValue(G g, std::size_t num_thread = 1)
884  {
885  switch(mDesc.GetNumOfDimension())
886  {
887  case 1: {
888  auto f = [&](auto i) { (*this)(i) = g(i); };
889  make_ParallelTensorFunctor(f, mDesc.GetLengths()[0])(num_thread);
890  break;
891  }
892  case 2: {
893  auto f = [&](auto i0, auto i1) { (*this)(i0, i1) = g(i0, i1); };
894  make_ParallelTensorFunctor(f, mDesc.GetLengths()[0], mDesc.GetLengths()[1])(num_thread);
895  break;
896  }
897  case 3: {
898  auto f = [&](auto i0, auto i1, auto i2) { (*this)(i0, i1, i2) = g(i0, i1, i2); };
900  f, mDesc.GetLengths()[0], mDesc.GetLengths()[1], mDesc.GetLengths()[2])(num_thread);
901  break;
902  }
903  case 4: {
904  auto f = [&](auto i0, auto i1, auto i2, auto i3) {
905  (*this)(i0, i1, i2, i3) = g(i0, i1, i2, i3);
906  };
908  mDesc.GetLengths()[0],
909  mDesc.GetLengths()[1],
910  mDesc.GetLengths()[2],
911  mDesc.GetLengths()[3])(num_thread);
912  break;
913  }
914  case 5: {
915  auto f = [&](auto i0, auto i1, auto i2, auto i3, auto i4) {
916  (*this)(i0, i1, i2, i3, i4) = g(i0, i1, i2, i3, i4);
917  };
919  mDesc.GetLengths()[0],
920  mDesc.GetLengths()[1],
921  mDesc.GetLengths()[2],
922  mDesc.GetLengths()[3],
923  mDesc.GetLengths()[4])(num_thread);
924  break;
925  }
926  case 6: {
927  auto f = [&](auto i0, auto i1, auto i2, auto i3, auto i4, auto i5) {
928  (*this)(i0, i1, i2, i3, i4, i5) = g(i0, i1, i2, i3, i4, i5);
929  };
931  mDesc.GetLengths()[0],
932  mDesc.GetLengths()[1],
933  mDesc.GetLengths()[2],
934  mDesc.GetLengths()[3],
935  mDesc.GetLengths()[4],
936  mDesc.GetLengths()[5])(num_thread);
937  break;
938  }
939  case 12: {
940  auto f = [&](auto i0,
941  auto i1,
942  auto i2,
943  auto i3,
944  auto i4,
945  auto i5,
946  auto i6,
947  auto i7,
948  auto i8,
949  auto i9,
950  auto i10,
951  auto i11) {
952  (*this)(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11) =
953  g(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11);
954  };
956  mDesc.GetLengths()[0],
957  mDesc.GetLengths()[1],
958  mDesc.GetLengths()[2],
959  mDesc.GetLengths()[3],
960  mDesc.GetLengths()[4],
961  mDesc.GetLengths()[5],
962  mDesc.GetLengths()[6],
963  mDesc.GetLengths()[7],
964  mDesc.GetLengths()[8],
965  mDesc.GetLengths()[9],
966  mDesc.GetLengths()[10],
967  mDesc.GetLengths()[11])(num_thread);
968  break;
969  }
970  default: throw std::runtime_error("unspported dimension");
971  }
972  }
973 
974  // Generate random values with multiple threads. Guaranteed to give the same sequence with any
975  // number of threads provided.
976  template <typename Distribution = std::uniform_real_distribution<float>,
977  typename Mapping = ck::identity,
978  typename Generator = std::minstd_rand>
979  void GenerateTensorDistr(Distribution dis = {0.f, 1.f},
980  Mapping fn = {},
981  const Generator g = Generator(0), // default seed 0
982  std::size_t num_thread = -1)
983  {
985  using ck::math::min;
986  if(num_thread == -1ULL)
987  num_thread = min(ck::get_available_cpu_cores(), 80U); // max 80 threads
988  // At least 2MB per thread
989  num_thread = min(num_thread, integer_divide_ceil(this->GetElementSpaceSize(), 0x200000));
990  constexpr std::size_t BLOCK_BYTES = 64;
991  constexpr std::size_t BLOCK_SIZE = BLOCK_BYTES / sizeof(T);
992 
993  const std::size_t num_blocks = integer_divide_ceil(this->GetElementSpaceSize(), BLOCK_SIZE);
994  const std::size_t blocks_per_thread = integer_divide_ceil(num_blocks, num_thread);
995 
996  std::vector<std::thread> threads;
997  threads.reserve(num_thread - 1);
998  const auto dst = const_cast<T*>(this->mData.data());
999  const auto element_space_size = this->GetElementSpaceSize();
1000  for(int it = num_thread - 1; it >= 0; --it)
1001  {
1002  std::size_t ib_begin = it * blocks_per_thread;
1003  std::size_t ib_end = min(ib_begin + blocks_per_thread, num_blocks);
1004 
1005  auto job = [=]() {
1006  auto g_ = g; // copy
1007  auto dis_ = dis; // copy
1008  g_.discard(ib_begin * BLOCK_SIZE * ck::packed_size_v<T>);
1009  auto t_fn = [&]() {
1010  // As user can pass integer distribution in dis, we must ensure that the correct
1011  // constructor/converter is called at all times. For f4/f6/f8 types, to ensure
1012  // correct results, we convert from float to the target type. In these cases
1013  // integer constructors are interpreted as direct initialization of the internal
1014  // storage with binary values instead of treating integers as subset of floats.
1015  if constexpr(ck::is_same_v<T, ck::f8_t> || ck::is_same_v<T, ck::bf8_t>)
1016  return ck::type_convert<T>(static_cast<float>(fn(dis_(g_))));
1017  else if constexpr(ck::packed_size_v<T> == 1)
1018  return ck::type_convert<T>(fn(dis_(g_)));
1019  else if constexpr(ck::is_same_v<T, ck::f4x2_pk_t>)
1020  return ck::f4x2_pk_t{ck::type_convert<ck::f4x2_t>(
1021  ck::float2_t{ck::type_convert<float>(fn(dis_(g_))),
1022  ck::type_convert<float>(fn(dis_(g_)))})};
1023  else if constexpr(ck::is_same_v<T, ck::f6x32_pk_t> ||
1024  ck::is_same_v<T, ck::bf6x32_pk_t>)
1025  {
1026  return ck::type_convert<T>(
1027  ck::float32_t{ck::type_convert<float>(fn(dis_(g_))),
1028  ck::type_convert<float>(fn(dis_(g_))),
1029  ck::type_convert<float>(fn(dis_(g_))),
1030  ck::type_convert<float>(fn(dis_(g_))),
1031  ck::type_convert<float>(fn(dis_(g_))),
1032  ck::type_convert<float>(fn(dis_(g_))),
1033  ck::type_convert<float>(fn(dis_(g_))),
1034  ck::type_convert<float>(fn(dis_(g_))),
1035  ck::type_convert<float>(fn(dis_(g_))),
1036  ck::type_convert<float>(fn(dis_(g_))),
1037  ck::type_convert<float>(fn(dis_(g_))),
1038  ck::type_convert<float>(fn(dis_(g_))),
1039  ck::type_convert<float>(fn(dis_(g_))),
1040  ck::type_convert<float>(fn(dis_(g_))),
1041  ck::type_convert<float>(fn(dis_(g_))),
1042  ck::type_convert<float>(fn(dis_(g_))),
1043  ck::type_convert<float>(fn(dis_(g_))),
1044  ck::type_convert<float>(fn(dis_(g_))),
1045  ck::type_convert<float>(fn(dis_(g_))),
1046  ck::type_convert<float>(fn(dis_(g_))),
1047  ck::type_convert<float>(fn(dis_(g_))),
1048  ck::type_convert<float>(fn(dis_(g_))),
1049  ck::type_convert<float>(fn(dis_(g_))),
1050  ck::type_convert<float>(fn(dis_(g_))),
1051  ck::type_convert<float>(fn(dis_(g_))),
1052  ck::type_convert<float>(fn(dis_(g_))),
1053  ck::type_convert<float>(fn(dis_(g_))),
1054  ck::type_convert<float>(fn(dis_(g_))),
1055  ck::type_convert<float>(fn(dis_(g_))),
1056  ck::type_convert<float>(fn(dis_(g_))),
1057  ck::type_convert<float>(fn(dis_(g_))),
1058  ck::type_convert<float>(fn(dis_(g_)))});
1059  }
1060  else if constexpr(ck::is_same_v<T, ck::f6x16_pk_t> ||
1061  ck::is_same_v<T, ck::bf6x16_pk_t>)
1062  {
1063  return ck::type_convert<T>(
1064  ck::float16_t{ck::type_convert<float>(fn(dis_(g_))),
1065  ck::type_convert<float>(fn(dis_(g_))),
1066  ck::type_convert<float>(fn(dis_(g_))),
1067  ck::type_convert<float>(fn(dis_(g_))),
1068  ck::type_convert<float>(fn(dis_(g_))),
1069  ck::type_convert<float>(fn(dis_(g_))),
1070  ck::type_convert<float>(fn(dis_(g_))),
1071  ck::type_convert<float>(fn(dis_(g_))),
1072  ck::type_convert<float>(fn(dis_(g_))),
1073  ck::type_convert<float>(fn(dis_(g_))),
1074  ck::type_convert<float>(fn(dis_(g_))),
1075  ck::type_convert<float>(fn(dis_(g_))),
1076  ck::type_convert<float>(fn(dis_(g_))),
1077  ck::type_convert<float>(fn(dis_(g_))),
1078  ck::type_convert<float>(fn(dis_(g_))),
1079  ck::type_convert<float>(fn(dis_(g_)))});
1080  }
1081  else
1082  static_assert(false, "Unsupported packed size for T");
1083  };
1084 
1085  std::size_t ib = ib_begin;
1086  for(; ib < ib_end - 1; ++ib)
1087  ck::static_for<0, BLOCK_SIZE, 1>{}([&](auto iw_) {
1088  constexpr size_t iw = iw_.value;
1089  dst[ib * BLOCK_SIZE + iw] = t_fn();
1090  });
1091  for(std::size_t iw = 0; iw < BLOCK_SIZE; ++iw)
1092  if(ib * BLOCK_SIZE + iw < element_space_size)
1093  dst[ib * BLOCK_SIZE + iw] = t_fn();
1094  };
1095 
1096  if(it > 0)
1097  threads.emplace_back(std::move(job));
1098  else
1099  job(); // last job run in the main thread
1100  }
1101  for(auto& t : threads)
1102  t.join();
1103  }
1104 
// Flat offset into mData: the descriptor offset scaled down for packed
// element types (packed_size logical values share one stored T).
template <typename... Is>
std::size_t GetOffsetFromMultiIndex(Is... is) const
{
    return mDesc.GetOffsetFromMultiIndex(is...) / ck::packed_size_v<ck::remove_cvref_t<T>>;
}
1110 
// Mutable element access by individual indices; the offset is scaled for
// packed element types.
template <typename... Is>
T& operator()(Is... is)
{
    return mData[mDesc.GetOffsetFromMultiIndex(is...) /
                 ck::packed_size_v<ck::remove_cvref_t<T>>];
}

// Read-only element access by individual indices.
template <typename... Is>
const T& operator()(Is... is) const
{
    return mData[mDesc.GetOffsetFromMultiIndex(is...) /
                 ck::packed_size_v<ck::remove_cvref_t<T>>];
}

// Mutable element access via an index vector.
T& operator()(const std::vector<std::size_t>& idx)
{
    return mData[mDesc.GetOffsetFromMultiIndex(idx) / ck::packed_size_v<ck::remove_cvref_t<T>>];
}

// Read-only element access via an index vector.
const T& operator()(const std::vector<std::size_t>& idx) const
{
    return mData[mDesc.GetOffsetFromMultiIndex(idx) / ck::packed_size_v<ck::remove_cvref_t<T>>];
}
1134 
// STL-style access to the underlying storage vector.
typename Data::iterator begin() { return mData.begin(); }

typename Data::iterator end() { return mData.end(); }

typename Data::pointer data() { return mData.data(); }

typename Data::const_iterator begin() const { return mData.begin(); }

typename Data::const_iterator end() const { return mData.end(); }

typename Data::const_pointer data() const { return mData.data(); }

// Number of stored T objects (storage units — for packed element types
// this is smaller than the logical element count).
typename Data::size_type size() const { return mData.size(); }
1148 
// Reinterpret the underlying storage as a read-only span of U (default
// T). The element count is rescaled by sizeof(T)/sizeof(U); sizes that
// do not divide evenly are truncated.
template <typename U = T>
auto AsSpan() const
{
    constexpr std::size_t FromSize = sizeof(T);
    constexpr std::size_t ToSize   = sizeof(U);

    using Element = std::add_const_t<std::remove_reference_t<U>>;
    return ck::span<Element>{reinterpret_cast<Element*>(data()), size() * FromSize / ToSize};
}

// Mutable variant of AsSpan.
template <typename U = T>
auto AsSpan()
{
    constexpr std::size_t FromSize = sizeof(T);
    constexpr std::size_t ToSize   = sizeof(U);

    using Element = std::remove_reference_t<U>;
    return ck::span<Element>{reinterpret_cast<Element*>(data()), size() * FromSize / ToSize};
}
1168 
1171 };
1172 
1173 } // namespace ck
Definition: span.hpp:14
__host__ constexpr __device__ auto rank([[maybe_unused]] const Layout< Shape, UnrolledDescriptorType > &layout)
Get layout rank (num elements in shape).
Definition: layout_utils.hpp:310
__host__ constexpr __device__ auto integer_divide_ceil(X x, Y y)
Definition: math.hpp:72
__host__ constexpr __device__ T min(T x)
Definition: math.hpp:116
auto transform(InputRange &&range, OutputIterator iter, UnaryOperation unary_op) -> decltype(std::transform(std::begin(range), std::end(range), iter, unary_op))
Definition: algorithm.hpp:36
auto copy(InputRange &&range, OutputIterator iter) -> decltype(std::copy(std::begin(std::forward< InputRange >(range)), std::end(std::forward< InputRange >(range)), iter))
Definition: algorithm.hpp:14
iter_value_t< ranges::iterator_t< R > > range_value_t
Definition: ranges.hpp:28
Definition: ck.hpp:270
typename vector_type< float, 16 >::type float16_t
Definition: dtype_vector.hpp:2149
unsigned int get_available_cpu_cores()
Definition: thread.hpp:11
HostTensorDescriptor transpose_host_tensor_descriptor_given_new2old(const HostTensorDescriptor &a, const New2Old &new2old, const NewLayout &new_layout=NewLayout())
Definition: host_tensor.hpp:609
std::ostream & LogRange(std::ostream &os, Range &&range, std::string delim)
Definition: host_tensor.hpp:29
auto call_f_unpack_args_impl(F f, T args, std::index_sequence< Is... >)
Definition: host_tensor.hpp:77
int64_t long_index_t
Definition: ck.hpp:302
typename vector_type< float, 2 >::type float2_t
Definition: dtype_vector.hpp:2146
auto construct_f_unpack_args_impl(T args, std::index_sequence< Is... >)
Definition: host_tensor.hpp:91
__host__ constexpr __device__ Y type_convert(X x)
Definition: type_convert.hpp:98
constexpr bool is_base_of_v
Definition: type.hpp:286
auto construct_f_unpack_args(F, T args)
Definition: host_tensor.hpp:97
constexpr bool is_same_v
Definition: type.hpp:283
auto make_ParallelTensorFunctor(F f, Xs... xs)
Definition: host_tensor.hpp:697
constexpr bool is_packed_type_v
Definition: data_type.hpp:414
remove_cv_t< remove_reference_t< T > > remove_cvref_t
Definition: type.hpp:297
typename std::enable_if< B, T >::type enable_if_t
Definition: enable_if.hpp:27
auto call_f_unpack_args(F f, T args)
Definition: host_tensor.hpp:83
__device__ void inner_product(const TA &a, const TB &b, TC &c)
typename vector_type< float, 32 >::type float32_t
Definition: dtype_vector.hpp:2150
std::ostream & LogRangeAsType(std::ostream &os, Range &&range, std::string delim)
Definition: host_tensor.hpp:44
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1697
const GenericPointer< typename T::ValueType > & pointer
Definition: pointer.h:1514
const GenericPointer< typename T::ValueType > T2 T::AllocatorType & a
Definition: pointer.h:1517
Layout wrapper that performs the tensor descriptor logic.
Definition: layout.hpp:24
A descriptor class for host tensors that manages tensor dimensions, strides, and layout.
Definition: host_tensor.hpp:173
const std::vector< std::size_t > & GetStrides() const
HostTensorDescriptor()
Definition: host_tensor.hpp:210
ChosenLayout HandleDefaultLayout(const Layout &)
Definition: host_tensor.hpp:228
void DispatchChosenLayout(ChosenLayout tag, const OrigLayout &orig, F &&f) const
Definition: host_tensor.hpp:216
HostTensorDescriptor(const std::initializer_list< X > &lens, const Layout &layout=Layout{})
Definition: host_tensor.hpp:471
std::size_t GetElementSize() const
HostTensorDescriptor(const Lengths &lens, const Layout &layout=Layout{})
Definition: host_tensor.hpp:494
HostTensorDescriptor(const Lengths &lens, const Strides &strides, const Layout &layout=Layout{})
Definition: host_tensor.hpp:552
std::size_t GetElementSpaceSize() const
HostTensorDescriptor(const std::initializer_list< ck::long_index_t > &lens, const Layout &layout=Layout{})
Definition: host_tensor.hpp:480
std::size_t GetOffsetFromMultiIndex(Is... is) const
Definition: host_tensor.hpp:571
void CalculateStrides(const Layout &layout)
Definition: host_tensor.hpp:284
ck::tensor_layout::BaseTensorLayout BaseTensorLayout
Definition: host_tensor.hpp:174
void ValidateStrides(const Layout &layout) const
Definition: host_tensor.hpp:353
HostTensorDescriptor(const std::initializer_list< X > &lens, const std::initializer_list< Y > &strides, const Layout &layout=Layout{})
Definition: host_tensor.hpp:506
std::size_t GetOffsetFromMultiIndex(const std::vector< std::size_t > &iss) const
Definition: host_tensor.hpp:578
HostTensorDescriptor(const std::initializer_list< std::size_t > &lens, const Strides &strides, const Layout &layout=Layout{})
Definition: host_tensor.hpp:532
HostTensorDescriptor(const std::initializer_list< ck::long_index_t > &lens, const std::initializer_list< ck::long_index_t > &strides, const Layout &layout=Layout{})
Definition: host_tensor.hpp:519
friend std::ostream & operator<<(std::ostream &os, ChosenLayout tag)
std::size_t GetNumOfDimension() const
HostTensorDescriptor(std::vector< std::size_t > lens, std::vector< std::size_t > strides, const Layout &layout=DefaultLayout())
Definition: host_tensor.hpp:188
const std::vector< std::size_t > & GetLengths() const
ChosenLayout
Definition: host_tensor.hpp:180
BaseTensorLayout DefaultLayout
Definition: host_tensor.hpp:175
friend std::ostream & operator<<(std::ostream &os, const HostTensorDescriptor &desc)
Definition: host_tensor.hpp:644
std::array< std::size_t, NDIM > GetNdIndices(std::size_t i) const
Definition: host_tensor.hpp:661
std::size_t mN1d
Definition: host_tensor.hpp:649
std::array< std::size_t, NDIM > mStrides
Definition: host_tensor.hpp:648
static constexpr std::size_t NDIM
Definition: host_tensor.hpp:646
F mF
Definition: host_tensor.hpp:645
std::array< std::size_t, NDIM > mLens
Definition: host_tensor.hpp:647
ParallelTensorFunctor(F f, Xs... xs)
Definition: host_tensor.hpp:651
void operator()(std::size_t num_thread=1) const
Definition: host_tensor.hpp:674
Definition: host_tensor.hpp:704
void GenerateTensorDistr(Distribution dis={0.f, 1.f}, Mapping fn={}, const Generator g=Generator(0), std::size_t num_thread=-1)
Definition: host_tensor.hpp:979
std::size_t GetOffsetFromMultiIndex(Is... is) const
Definition: host_tensor.hpp:1106
~Tensor()=default
void ForEach(const F &&f) const
Definition: host_tensor.hpp:876
Tensor(const Lengths &lens, const Strides &strides, Rest &&... rest)
Definition: host_tensor.hpp:755
decltype(auto) GetStrides() const
Definition: host_tensor.hpp:814
std::size_t GetElementSpaceSize() const
Definition: host_tensor.hpp:820
Tensor(std::initializer_list< X > lens)
Definition: host_tensor.hpp:709
Descriptor mDesc
Definition: host_tensor.hpp:1169
auto AsSpan() const
Definition: host_tensor.hpp:1150
void SetZero()
Definition: host_tensor.hpp:834
void ForEach_impl(F &&f, std::vector< size_t > &idx, size_t rank)
Definition: host_tensor.hpp:837
const T & operator()(const std::vector< std::size_t > &idx) const
Definition: host_tensor.hpp:1130
Data::const_pointer data() const
Definition: host_tensor.hpp:1145
Tensor(std::initializer_list< X > lens, std::initializer_list< Y > strides)
Definition: host_tensor.hpp:714
Data mData
Definition: host_tensor.hpp:1170
std::vector< T > Data
Definition: host_tensor.hpp:706
Tensor(const Descriptor &desc)
Definition: host_tensor.hpp:760
Tensor(std::initializer_list< X > lens, std::initializer_list< Y > strides, Rest &&... rest)
Definition: host_tensor.hpp:740
T & operator()(Is... is)
Definition: host_tensor.hpp:1112
Tensor & operator=(const Tensor &)=default
Data::pointer data()
Definition: host_tensor.hpp:1139
std::size_t GetElementSpaceSizeInBytes() const
Definition: host_tensor.hpp:832
Tensor(const Lengths &lens, const Strides &strides)
Definition: host_tensor.hpp:725
auto AsSpan()
Definition: host_tensor.hpp:1160
Tensor(std::initializer_list< X > lens, Rest &&... rest)
Definition: host_tensor.hpp:731
decltype(auto) GetLengths() const
Definition: host_tensor.hpp:812
Tensor & operator=(Tensor &&)=default
Tensor< OutT > CopyAsType() const
Definition: host_tensor.hpp:763
std::size_t GetNumOfDimension() const
Definition: host_tensor.hpp:816
void ForEach_impl(const F &&f, std::vector< size_t > &idx, size_t rank) const
Definition: host_tensor.hpp:860
Data::size_type size() const
Definition: host_tensor.hpp:1147
void ForEach(F &&f)
Definition: host_tensor.hpp:853
Tensor(const Lengths &lens)
Definition: host_tensor.hpp:720
T & operator()(const std::vector< std::size_t > &idx)
Definition: host_tensor.hpp:1125
std::size_t GetElementSize() const
Definition: host_tensor.hpp:818
const T & operator()(Is... is) const
Definition: host_tensor.hpp:1119
Tensor(const Tensor< FromT > &other)
Definition: host_tensor.hpp:783
Data::const_iterator end() const
Definition: host_tensor.hpp:1143
void savetxt(std::string file_name, std::string dtype="float")
Definition: host_tensor.hpp:786
Data::iterator end()
Definition: host_tensor.hpp:1137
Data::iterator begin()
Definition: host_tensor.hpp:1135
Tensor(const Lengths &lens, Rest &&... rest)
Definition: host_tensor.hpp:746
Tensor(const Tensor &)=default
void GenerateTensorValue(G g, std::size_t num_thread=1)
Definition: host_tensor.hpp:883
Tensor(Tensor &&)=default
Data::const_iterator begin() const
Definition: host_tensor.hpp:1141
Tensor()=delete
Definition: integral_constant.hpp:20
Definition: host_tensor.hpp:626
joinable_thread & operator=(joinable_thread &&)=default
~joinable_thread()
Definition: host_tensor.hpp:635
joinable_thread(Xs &&... xs)
Definition: host_tensor.hpp:628
joinable_thread(joinable_thread &&)=default
Definition: functional2.hpp:33
Definition: tensor_layout.hpp:10
Definition: tensor_layout.hpp:31
Definition: tensor_layout.hpp:26
Definition: dtype_vector.hpp:11
__host__ constexpr __device__ const auto & layout(const Tensor< BufferAddressSpace, ElementType, Shape, UnrolledDescriptorType > &tensor)
Get Tensor Layout.
Definition: tensor_utils.hpp:162