cxx11_tensor_thread_pool.cpp
Go to the documentation of this file.
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 
10 #define EIGEN_USE_THREADS
11 
12 
13 #include "main.h"
14 #include <iostream>
15 #include <Eigen/CXX11/Tensor>
16 
17 using Eigen::Tensor;
18 
19 class TestAllocator : public Allocator {
20  public:
22  EIGEN_DEVICE_FUNC void* allocate(size_t num_bytes) const EIGEN_OVERRIDE {
23  const_cast<TestAllocator*>(this)->alloc_count_++;
24  return internal::aligned_malloc(num_bytes);
25  }
27  const_cast<TestAllocator*>(this)->dealloc_count_++;
29  }
30 
31  int alloc_count() const { return alloc_count_; }
32  int dealloc_count() const { return dealloc_count_; }
33 
34  private:
35  int alloc_count_ = 0;
36  int dealloc_count_ = 0;
37 };
38 
40 {
41  Tensor<float, 3> in1(200, 30, 70);
42  Tensor<float, 3> in2(200, 30, 70);
43  Tensor<double, 3> out(200, 30, 70);
44 
45  in1.setRandom();
46  in2.setRandom();
47 
48  Eigen::ThreadPool tp(internal::random<int>(3, 11));
49  Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(3, 11));
50  out.device(thread_pool_device) = (in1 + in2 * 3.14f).cast<double>();
51 
52  for (int i = 0; i < 200; ++i) {
53  for (int j = 0; j < 30; ++j) {
54  for (int k = 0; k < 70; ++k) {
55  VERIFY_IS_APPROX(out(i, j, k), static_cast<double>(in1(i, j, k) + in2(i, j, k) * 3.14f));
56  }
57  }
58  }
59 }
60 
62 {
63  Tensor<float, 3> in1(200, 30, 70);
64  Tensor<float, 3> in2(200, 30, 70);
65  Tensor<double, 3> out(200, 30, 70);
66 
67  in1.setRandom();
68  in2.setRandom();
69 
70  Eigen::ThreadPool tp(internal::random<int>(3, 11));
71  Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(3, 11));
72 
73  Eigen::Barrier b(1);
74  out.device(thread_pool_device, [&b]() { b.Notify(); }) = (in1 + in2 * 3.14f).cast<double>();
75  b.Wait();
76 
77  for (int i = 0; i < 200; ++i) {
78  for (int j = 0; j < 30; ++j) {
79  for (int k = 0; k < 70; ++k) {
80  VERIFY_IS_APPROX(out(i, j, k), static_cast<double>(in1(i, j, k) + in2(i, j, k) * 3.14f));
81  }
82  }
83  }
84 }
85 
87 {
88  Tensor<float, 3> in1(2,3,7);
89  Tensor<float, 3> in2(2,3,7);
90  Tensor<float, 3> out(2,3,7);
91 
92  in1.setRandom();
93  in2.setRandom();
94 
95  Eigen::ThreadPool tp(internal::random<int>(3, 11));
96  Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(3, 11));
97  out.device(thread_pool_device) = in1;
98  out.device(thread_pool_device) += in2 * 3.14f;
99 
100  for (int i = 0; i < 2; ++i) {
101  for (int j = 0; j < 3; ++j) {
102  for (int k = 0; k < 7; ++k) {
103  VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f);
104  }
105  }
106  }
107 }
108 
109 template<int DataLayout>
111 {
112  Tensor<float, 4, DataLayout> t_left(30, 50, 37, 31);
113  Tensor<float, 5, DataLayout> t_right(37, 31, 70, 2, 10);
114  Tensor<float, 5, DataLayout> t_result(30, 50, 70, 2, 10);
115 
116  t_left.setRandom();
117  t_right.setRandom();
118 
119  // this contraction should be equivalent to a single matrix multiplication
121  Eigen::array<DimPair, 2> dims({{DimPair(2, 0), DimPair(3, 1)}});
122 
124  MapXf m_left(t_left.data(), 1500, 1147);
125  MapXf m_right(t_right.data(), 1147, 1400);
126  Matrix<float, Dynamic, Dynamic, DataLayout> m_result(1500, 1400);
127 
128  Eigen::ThreadPool tp(4);
129  Eigen::ThreadPoolDevice thread_pool_device(&tp, 4);
130 
131  // compute results by separate methods
132  t_result.device(thread_pool_device) = t_left.contract(t_right, dims);
133  m_result = m_left * m_right;
134 
135  for (ptrdiff_t i = 0; i < t_result.size(); i++) {
136  VERIFY(&t_result.data()[i] != &m_result.data()[i]);
137  if (fabsf(t_result(i) - m_result(i)) < 1e-4f) {
138  continue;
139  }
140  if (Eigen::internal::isApprox(t_result(i), m_result(i), 1e-4f)) {
141  continue;
142  }
143  std::cout << "mismatch detected at index " << i << ": " << t_result(i)
144  << " vs " << m_result(i) << std::endl;
145  assert(false);
146  }
147 }
148 
149 template<int DataLayout>
151 {
152  Tensor<float, 2, DataLayout> t_left(32, 500);
153  Tensor<float, 2, DataLayout> t_right(32, 28*28);
154  Tensor<float, 2, DataLayout> t_result(500, 28*28);
155 
156  t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f;
157  t_right = (t_right.constant(-0.6f) + t_right.random()) * 2.0f;
158  t_result = t_result.constant(NAN);
159 
160  // this contraction should be equivalent to a single matrix multiplication
162  Eigen::array<DimPair, 1> dims{{DimPair(0, 0)}};
163 
165  MapXf m_left(t_left.data(), 32, 500);
166  MapXf m_right(t_right.data(), 32, 28*28);
167  Matrix<float, Dynamic, Dynamic, DataLayout> m_result(500, 28*28);
168 
169  Eigen::ThreadPool tp(12);
170  Eigen::ThreadPoolDevice thread_pool_device(&tp, 12);
171 
172  // compute results by separate methods
173  t_result.device(thread_pool_device) = t_left.contract(t_right, dims);
174  m_result = m_left.transpose() * m_right;
175 
176  for (ptrdiff_t i = 0; i < t_result.size(); i++) {
177  assert(!(numext::isnan)(t_result.data()[i]));
178  if (fabsf(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) {
179  std::cout << "mismatch detected at index " << i << " : " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl;
180  assert(false);
181  }
182  }
183 
184  t_left.resize(32, 1);
185  t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f;
186  t_result.resize (1, 28*28);
187  t_result = t_result.constant(NAN);
188  t_result.device(thread_pool_device) = t_left.contract(t_right, dims);
189  new(&m_left) MapXf(t_left.data(), 32, 1);
190  m_result = m_left.transpose() * m_right;
191  for (ptrdiff_t i = 0; i < t_result.size(); i++) {
192  assert(!(numext::isnan)(t_result.data()[i]));
193  if (fabsf(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) {
194  std::cout << "mismatch detected: " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl;
195  assert(false);
196  }
197  }
198 
199  t_left.resize(32, 500);
200  t_right.resize(32, 4);
201  t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f;
202  t_right = (t_right.constant(-0.6f) + t_right.random()) * 2.0f;
203  t_result.resize (500, 4);
204  t_result = t_result.constant(NAN);
205  t_result.device(thread_pool_device) = t_left.contract(t_right, dims);
206  new(&m_left) MapXf(t_left.data(), 32, 500);
207  new(&m_right) MapXf(t_right.data(), 32, 4);
208  m_result = m_left.transpose() * m_right;
209  for (ptrdiff_t i = 0; i < t_result.size(); i++) {
210  assert(!(numext::isnan)(t_result.data()[i]));
211  if (fabsf(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) {
212  std::cout << "mismatch detected: " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl;
213  assert(false);
214  }
215  }
216 
217  t_left.resize(32, 1);
218  t_right.resize(32, 4);
219  t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f;
220  t_right = (t_right.constant(-0.6f) + t_right.random()) * 2.0f;
221  t_result.resize (1, 4);
222  t_result = t_result.constant(NAN);
223  t_result.device(thread_pool_device) = t_left.contract(t_right, dims);
224  new(&m_left) MapXf(t_left.data(), 32, 1);
225  new(&m_right) MapXf(t_right.data(), 32, 4);
226  m_result = m_left.transpose() * m_right;
227  for (ptrdiff_t i = 0; i < t_result.size(); i++) {
228  assert(!(numext::isnan)(t_result.data()[i]));
229  if (fabsf(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) {
230  std::cout << "mismatch detected: " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl;
231  assert(false);
232  }
233  }
234 }
235 
236 template<int DataLayout>
238  int contract_size = internal::random<int>(1, 5000);
239 
240  Tensor<float, 3, DataLayout> left(internal::random<int>(1, 80),
241  contract_size,
242  internal::random<int>(1, 100));
243 
244  Tensor<float, 4, DataLayout> right(internal::random<int>(1, 25),
245  internal::random<int>(1, 37),
246  contract_size,
247  internal::random<int>(1, 51));
248 
249  left.setRandom();
250  right.setRandom();
251 
252  // add constants to shift values away from 0 for more precision
253  left += left.constant(1.5f);
254  right += right.constant(1.5f);
255 
257  Eigen::array<DimPair, 1> dims({{DimPair(1, 2)}});
258 
259  Eigen::ThreadPool tp(internal::random<int>(2, 11));
260  Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(2, 11));
261 
263  st_result = left.contract(right, dims);
264 
265  Tensor<float, 5, DataLayout> tp_result(st_result.dimensions());
266  tp_result.device(thread_pool_device) = left.contract(right, dims);
267 
268  VERIFY(dimensions_match(st_result.dimensions(), tp_result.dimensions()));
269  for (ptrdiff_t i = 0; i < st_result.size(); i++) {
270  // if both of the values are very small, then do nothing (because the test will fail
271  // due to numerical precision issues when values are small)
272  if (numext::abs(st_result.data()[i] - tp_result.data()[i]) >= 1e-4f) {
273  VERIFY_IS_APPROX(st_result.data()[i], tp_result.data()[i]);
274  }
275  }
276 }
277 
278 // Apply Sqrt to all output elements.
279 struct SqrtOutputKernel {
280  template <typename Index, typename Scalar>
282  const internal::blas_data_mapper<Scalar, Index, ColMajor>& output_mapper,
283  const TensorContractionParams&, Index, Index, Index num_rows,
284  Index num_cols) const {
285  for (int i = 0; i < num_rows; ++i) {
286  for (int j = 0; j < num_cols; ++j) {
287  output_mapper(i, j) = std::sqrt(output_mapper(i, j));
288  }
289  }
290  }
291 };
292 
293 template <int DataLayout>
296 
297  const int num_threads = internal::random<int>(2, 11);
298  ThreadPool threads(num_threads);
299  Eigen::ThreadPoolDevice device(&threads, num_threads);
300 
301  Tensor<float, 4, DataLayout> t_left(30, 50, 8, 31);
302  Tensor<float, 5, DataLayout> t_right(8, 31, 7, 20, 10);
303  Tensor<float, 5, DataLayout> t_result(30, 50, 7, 20, 10);
304 
305  t_left.setRandom();
306  t_right.setRandom();
307  // Put trash in mat4 to verify contraction clears output memory.
308  t_result.setRandom();
309 
310  // Add a little offset so that the results won't be close to zero.
311  t_left += t_left.constant(1.0f);
312  t_right += t_right.constant(1.0f);
313 
315  MapXf m_left(t_left.data(), 1500, 248);
316  MapXf m_right(t_right.data(), 248, 1400);
318 
319  // this contraction should be equivalent to a single matrix multiplication
320  Eigen::array<DimPair, 2> dims({{DimPair(2, 0), DimPair(3, 1)}});
321 
322  // compute results by separate methods
323  t_result.device(device) = t_left.contract(t_right, dims, SqrtOutputKernel());
324 
325  m_result = m_left * m_right;
326 
327  for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) {
328  VERIFY(&t_result.data()[i] != &m_result.data()[i]);
329  VERIFY_IS_APPROX(t_result.data()[i], std::sqrt(m_result.data()[i]));
330  }
331 }
332 
333 template<int DataLayout>
335 {
336  int contract_size = internal::random<int>(100, 500);
337 
338  Tensor<float, 3, DataLayout> left(internal::random<int>(10, 40),
339  contract_size,
340  internal::random<int>(10, 40));
341 
343  internal::random<int>(1, 20), internal::random<int>(1, 20), contract_size,
344  internal::random<int>(1, 20));
345 
346  left.setRandom();
347  right.setRandom();
348 
349  // add constants to shift values away from 0 for more precision
350  left += left.constant(1.5f);
351  right += right.constant(1.5f);
352 
354  Eigen::array<DimPair, 1> dims({{DimPair(1, 2)}});
355 
356  Eigen::ThreadPool tp(internal::random<int>(2, 11));
357  Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(8, 32));
358 
360  st_result = left.contract(right, dims);
361 
362  Tensor<float, 5, DataLayout> tp_result(st_result.dimensions());
363 
364  Eigen::Barrier barrier(1);
365  tp_result.device(thread_pool_device, [&barrier]() { barrier.Notify(); }) =
366  left.contract(right, dims);
367  barrier.Wait();
368 
369  VERIFY(dimensions_match(st_result.dimensions(), tp_result.dimensions()));
370  for (ptrdiff_t i = 0; i < st_result.size(); i++) {
371  // if both of the values are very small, then do nothing (because the test
372  // will fail due to numerical precision issues when values are small)
373  if (numext::abs(st_result.data()[i] - tp_result.data()[i]) >= 1e-4f) {
374  VERIFY_IS_APPROX(st_result.data()[i], tp_result.data()[i]);
375  }
376  }
377 }
378 
379 // We are triggering 'evalShardedByInnerDim' optimization.
380 template <int DataLayout>
382 {
384 
385  const int num_threads = internal::random<int>(4, 16);
386  ThreadPool threads(num_threads);
387  Eigen::ThreadPoolDevice device(&threads, num_threads);
388 
389  Tensor<float, 2, DataLayout> t_left(2, 10000);
390  Tensor<float, 2, DataLayout> t_right(10000, 10);
391  Tensor<float, 2, DataLayout> t_result(2, 10);
392 
393  t_left.setRandom();
394  t_right.setRandom();
395  // Put trash in t_result to verify contraction clears output memory.
396  t_result.setRandom();
397 
398  // Add a little offset so that the results won't be close to zero.
399  t_left += t_left.constant(1.0f);
400  t_right += t_right.constant(1.0f);
401 
403  MapXf m_left(t_left.data(), 2, 10000);
404  MapXf m_right(t_right.data(), 10000, 10);
406 
407  // this contraction should be equivalent to a single matrix multiplication
408  Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}});
409 
410  // compute results by separate methods
411  t_result.device(device) = t_left.contract(t_right, dims);
412  m_result = m_left * m_right;
413 
414  for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) {
415  VERIFY_IS_APPROX(t_result.data()[i], m_result.data()[i]);
416  }
417 }
418 
419 // We are triggering 'evalShardedByInnerDim' optimization with output kernel.
420 template <int DataLayout>
422 {
424 
425  const int num_threads = internal::random<int>(4, 16);
426  ThreadPool threads(num_threads);
427  Eigen::ThreadPoolDevice device(&threads, num_threads);
428 
429  Tensor<float, 2, DataLayout> t_left(2, 10000);
430  Tensor<float, 2, DataLayout> t_right(10000, 10);
431  Tensor<float, 2, DataLayout> t_result(2, 10);
432 
433  t_left.setRandom();
434  t_right.setRandom();
435  // Put trash in t_result to verify contraction clears output memory.
436  t_result.setRandom();
437 
438  // Add a little offset so that the results won't be close to zero.
439  t_left += t_left.constant(1.0f);
440  t_right += t_right.constant(1.0f);
441 
443  MapXf m_left(t_left.data(), 2, 10000);
444  MapXf m_right(t_right.data(), 10000, 10);
446 
447  // this contraction should be equivalent to a single matrix multiplication
448  Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}});
449 
450  // compute results by separate methods
451  t_result.device(device) = t_left.contract(t_right, dims, SqrtOutputKernel());
452  m_result = m_left * m_right;
453 
454  for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) {
455  VERIFY_IS_APPROX(t_result.data()[i], std::sqrt(m_result.data()[i]));
456  }
457 }
458 
459 // We are triggering 'evalShardedByInnerDim' optimization.
460 template <int DataLayout>
462 {
464 
465  const int num_threads = internal::random<int>(4, 16);
466  ThreadPool threads(num_threads);
467  Eigen::ThreadPoolDevice device(&threads, num_threads);
468 
469  Tensor<float, 2, DataLayout> t_left(2, 10000);
470  Tensor<float, 2, DataLayout> t_right(10000, 10);
471  Tensor<float, 2, DataLayout> t_result(2, 10);
472 
473  t_left.setRandom();
474  t_right.setRandom();
475  // Put trash in t_result to verify contraction clears output memory.
476  t_result.setRandom();
477 
478  // Add a little offset so that the results won't be close to zero.
479  t_left += t_left.constant(1.0f);
480  t_right += t_right.constant(1.0f);
481 
483  MapXf m_left(t_left.data(), 2, 10000);
484  MapXf m_right(t_right.data(), 10000, 10);
486 
487  // this contraction should be equivalent to a single matrix multiplication
488  Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}});
489 
490  // compute results by separate methods
491  Eigen::Barrier barrier(1);
492  t_result.device(device, [&barrier]() { barrier.Notify(); }) =
493  t_left.contract(t_right, dims);
494  barrier.Wait();
495 
496  m_result = m_left * m_right;
497 
498  for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) {
499  VERIFY_IS_APPROX(t_result.data()[i], m_result.data()[i]);
500  }
501 }
502 
503 // We are triggering 'evalShardedByInnerDim' optimization with output kernel.
504 template <int DataLayout>
506 {
508 
509  const int num_threads = internal::random<int>(4, 16);
510  ThreadPool threads(num_threads);
511  Eigen::ThreadPoolDevice device(&threads, num_threads);
512 
513  Tensor<float, 2, DataLayout> t_left(2, 10000);
514  Tensor<float, 2, DataLayout> t_right(10000, 10);
515  Tensor<float, 2, DataLayout> t_result(2, 10);
516 
517  t_left.setRandom();
518  t_right.setRandom();
519  // Put trash in t_result to verify contraction clears output memory.
520  t_result.setRandom();
521 
522  // Add a little offset so that the results won't be close to zero.
523  t_left += t_left.constant(1.0f);
524  t_right += t_right.constant(1.0f);
525 
527  MapXf m_left(t_left.data(), 2, 10000);
528  MapXf m_right(t_right.data(), 10000, 10);
530 
531  // this contraction should be equivalent to a single matrix multiplication
532  Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}});
533 
534  // compute results by separate methods
535  Eigen::Barrier barrier(1);
536  t_result.device(device, [&barrier]() { barrier.Notify(); }) =
537  t_left.contract(t_right, dims, SqrtOutputKernel());
538  barrier.Wait();
539  m_result = m_left * m_right;
540 
541  for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) {
542  VERIFY_IS_APPROX(t_result.data()[i], std::sqrt(m_result.data()[i]));
543  }
544 }
545 
546 template<int DataLayout>
548  int contract_size1 = internal::random<int>(1, 500);
549  int contract_size2 = internal::random<int>(1, 500);
550 
551  Tensor<float, 2, DataLayout> left(contract_size1,
552  contract_size2);
553  Tensor<float, 2, DataLayout> right(contract_size1,
554  contract_size2);
555  left.setRandom();
556  right.setRandom();
557 
558  // add constants to shift values away from 0 for more precision
559  left += left.constant(1.5f);
560  right += right.constant(1.5f);
561 
563  Eigen::array<DimPair, 2> dims({{DimPair(0, 0), DimPair(1, 1)}});
564 
565  Eigen::ThreadPool tp(internal::random<int>(2, 11));
566  Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(2, 11));
567 
569  st_result = left.contract(right, dims);
570 
572  tp_result.device(thread_pool_device) = left.contract(right, dims);
573 
574  VERIFY(dimensions_match(st_result.dimensions(), tp_result.dimensions()));
575  // if both of the values are very small, then do nothing (because the test will fail
576  // due to numerical precision issues when values are small)
577  if (numext::abs(st_result() - tp_result()) >= 1e-4f) {
578  VERIFY_IS_APPROX(st_result(), tp_result());
579  }
580 }
581 
582 template<int DataLayout>
584  const int num_threads = internal::random<int>(3, 11);
585  ThreadPool thread_pool(num_threads);
586  Eigen::ThreadPoolDevice thread_pool_device(&thread_pool, num_threads);
587 
588  const int num_rows = internal::random<int>(13, 732);
589  const int num_cols = internal::random<int>(13, 732);
590  Tensor<float, 2, DataLayout> t1(num_rows, num_cols);
591  t1.setRandom();
592 
593  Tensor<float, 0, DataLayout> full_redux;
594  full_redux = t1.sum();
595 
596  Tensor<float, 0, DataLayout> full_redux_tp;
597  full_redux_tp.device(thread_pool_device) = t1.sum();
598 
599  // Check that the single threaded and the multi threaded reductions return
600  // the same result.
601  VERIFY_IS_APPROX(full_redux(), full_redux_tp());
602 }
603 
604 
605 void test_memcpy() {
606 
607  for (int i = 0; i < 5; ++i) {
608  const int num_threads = internal::random<int>(3, 11);
609  Eigen::ThreadPool tp(num_threads);
610  Eigen::ThreadPoolDevice thread_pool_device(&tp, num_threads);
611 
612  const int size = internal::random<int>(13, 7632);
614  t1.setRandom();
615  std::vector<float> result(size);
616  thread_pool_device.memcpy(&result[0], t1.data(), size*sizeof(float));
617  for (int j = 0; j < size; j++) {
618  VERIFY_IS_EQUAL(t1(j), result[j]);
619  }
620  }
621 }
622 
623 
625 {
626  Eigen::ThreadPool tp(2);
627  Eigen::ThreadPoolDevice device(&tp, 2);
628  Tensor<float, 1> t(1 << 20);
629  t.device(device) = t.random<Eigen::internal::NormalRandomGenerator<float>>();
630 }
631 
632 template<int DataLayout>
633 void test_multithread_shuffle(Allocator* allocator)
634 {
635  Tensor<float, 4, DataLayout> tensor(17,5,7,11);
636  tensor.setRandom();
637 
638  const int num_threads = internal::random<int>(2, 11);
639  ThreadPool threads(num_threads);
640  Eigen::ThreadPoolDevice device(&threads, num_threads, allocator);
641 
642  Tensor<float, 4, DataLayout> shuffle(7,5,11,17);
643  array<ptrdiff_t, 4> shuffles = {{2,1,3,0}};
644  shuffle.device(device) = tensor.shuffle(shuffles);
645 
646  for (int i = 0; i < 17; ++i) {
647  for (int j = 0; j < 5; ++j) {
648  for (int k = 0; k < 7; ++k) {
649  for (int l = 0; l < 11; ++l) {
650  VERIFY_IS_EQUAL(tensor(i,j,k,l), shuffle(k,j,l,i));
651  }
652  }
653  }
654  }
655 }
656 
658 {
659  const int num_threads = internal::random<int>(2, 11);
660  const int num_allocs = internal::random<int>(2, 11);
661  ThreadPool threads(num_threads);
662  Eigen::ThreadPoolDevice device(&threads, num_threads, allocator);
663 
664  for (int a = 0; a < num_allocs; ++a) {
665  void* ptr = device.allocate(512);
666  device.deallocate(ptr);
667  }
668  VERIFY(allocator != NULL);
669  VERIFY_IS_EQUAL(allocator->alloc_count(), num_allocs);
670  VERIFY_IS_EQUAL(allocator->dealloc_count(), num_allocs);
671 }
672 
673 EIGEN_DECLARE_TEST(cxx11_tensor_thread_pool)
674 {
678 
679  CALL_SUBTEST_2(test_multithread_contraction<ColMajor>());
680  CALL_SUBTEST_2(test_multithread_contraction<RowMajor>());
681 
682  CALL_SUBTEST_3(test_multithread_contraction_agrees_with_singlethread<ColMajor>());
683  CALL_SUBTEST_3(test_multithread_contraction_agrees_with_singlethread<RowMajor>());
684  CALL_SUBTEST_3(test_multithread_contraction_with_output_kernel<ColMajor>());
685  CALL_SUBTEST_3(test_multithread_contraction_with_output_kernel<RowMajor>());
686 
687  CALL_SUBTEST_4(test_async_multithread_contraction_agrees_with_singlethread<ColMajor>());
688  CALL_SUBTEST_4(test_async_multithread_contraction_agrees_with_singlethread<RowMajor>());
689 
690  // Test EvalShardedByInnerDimContext parallelization strategy.
691  CALL_SUBTEST_5(test_sharded_by_inner_dim_contraction<ColMajor>());
692  CALL_SUBTEST_5(test_sharded_by_inner_dim_contraction<RowMajor>());
693  CALL_SUBTEST_5(test_sharded_by_inner_dim_contraction_with_output_kernel<ColMajor>());
694  CALL_SUBTEST_5(test_sharded_by_inner_dim_contraction_with_output_kernel<RowMajor>());
695 
696  CALL_SUBTEST_6(test_async_sharded_by_inner_dim_contraction<ColMajor>());
697  CALL_SUBTEST_6(test_async_sharded_by_inner_dim_contraction<RowMajor>());
698  CALL_SUBTEST_6(test_async_sharded_by_inner_dim_contraction_with_output_kernel<ColMajor>());
699  CALL_SUBTEST_6(test_async_sharded_by_inner_dim_contraction_with_output_kernel<RowMajor>());
700 
701  // Exercise various cases that have been problematic in the past.
702  CALL_SUBTEST_7(test_contraction_corner_cases<ColMajor>());
703  CALL_SUBTEST_7(test_contraction_corner_cases<RowMajor>());
704 
705  CALL_SUBTEST_8(test_full_contraction<ColMajor>());
706  CALL_SUBTEST_8(test_full_contraction<RowMajor>());
707 
708  CALL_SUBTEST_9(test_multithreaded_reductions<ColMajor>());
709  CALL_SUBTEST_9(test_multithreaded_reductions<RowMajor>());
710 
713 
714  TestAllocator test_allocator;
715  CALL_SUBTEST_11(test_multithread_shuffle<ColMajor>(NULL));
716  CALL_SUBTEST_11(test_multithread_shuffle<RowMajor>(&test_allocator));
717  CALL_SUBTEST_11(test_threadpool_allocate(&test_allocator));
718 
719  // Force CMake to split this test.
720  // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8;9;10;11
721 }
SqrtOutputKernel::operator()
EIGEN_ALWAYS_INLINE void operator()(const internal::blas_data_mapper< Scalar, Index, ColMajor > &output_mapper, const TensorContractionParams &, Index, Index, Index num_rows, Index num_cols) const
Definition: cxx11_tensor_thread_pool.cpp:281
Eigen::Tensor
The tensor class.
Definition: Tensor.h:63
test_async_sharded_by_inner_dim_contraction_with_output_kernel
static void test_async_sharded_by_inner_dim_contraction_with_output_kernel()
Definition: cxx11_tensor_thread_pool.cpp:505
EIGEN_DEVICE_FUNC
#define EIGEN_DEVICE_FUNC
Definition: Macros.h:976
e
Array< double, 1, 3 > e(1./3., 0.5, 2.)
Eigen::array
Definition: EmulateArray.h:21
VERIFY_IS_EQUAL
#define VERIFY_IS_EQUAL(a, b)
Definition: main.h:386
b
Scalar * b
Definition: benchVecAdd.cpp:17
TestAllocator::alloc_count
int alloc_count() const
Definition: cxx11_tensor_thread_pool.cpp:31
test_memcpy
void test_memcpy()
Definition: cxx11_tensor_thread_pool.cpp:605
Eigen::internal::isApprox
EIGEN_DEVICE_FUNC bool isApprox(const Scalar &x, const Scalar &y, const typename NumTraits< Scalar >::Real &precision=NumTraits< Scalar >::dummy_precision())
Definition: Eigen/src/Core/MathFunctions.h:1947
test_full_contraction
void test_full_contraction()
Definition: cxx11_tensor_thread_pool.cpp:547
buffer
Definition: pytypes.h:2270
Eigen::internal::aligned_malloc
EIGEN_DEVICE_FUNC void * aligned_malloc(std::size_t size)
Definition: Memory.h:174
isnan
#define isnan(X)
Definition: main.h:93
test_contraction_corner_cases
void test_contraction_corner_cases()
Definition: cxx11_tensor_thread_pool.cpp:150
Eigen::Barrier::Wait
void Wait()
Definition: Barrier.h:40
Eigen::TensorBase< Tensor< Scalar_, NumIndices_, Options_, IndexType_ > >::shuffle
EIGEN_DEVICE_FUNC const EIGEN_STRONG_INLINE TensorShufflingOp< const Shuffle, const Tensor< Scalar_, NumIndices_, Options_, IndexType_ > > shuffle(const Shuffle &shfl) const
Definition: TensorBase.h:1123
CALL_SUBTEST_11
#define CALL_SUBTEST_11(FUNC)
Definition: split_test_helper.h:64
CALL_SUBTEST_9
#define CALL_SUBTEST_9(FUNC)
Definition: split_test_helper.h:52
result
Values result
Definition: OdometryOptimize.cpp:8
Eigen::TensorContractionParams
Definition: TensorContraction.h:281
size
Scalar Scalar int size
Definition: benchVecAdd.cpp:17
EIGEN_OVERRIDE
#define EIGEN_OVERRIDE
Definition: Macros.h:1449
Eigen::dimensions_match
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool dimensions_match(Dims1 dims1, Dims2 dims2)
Definition: TensorDimensions.h:484
CALL_SUBTEST_4
#define CALL_SUBTEST_4(FUNC)
Definition: split_test_helper.h:22
TestAllocator::dealloc_count
int dealloc_count() const
Definition: cxx11_tensor_thread_pool.cpp:32
TestAllocator::dealloc_count_
int dealloc_count_
Definition: cxx11_tensor_thread_pool.cpp:36
test_multithread_contraction_with_output_kernel
static void test_multithread_contraction_with_output_kernel()
Definition: cxx11_tensor_thread_pool.cpp:294
CALL_SUBTEST_3
#define CALL_SUBTEST_3(FUNC)
Definition: split_test_helper.h:16
SqrtOutputKernel
Definition: cxx11_tensor_contraction.cpp:515
TestAllocator::deallocate
EIGEN_DEVICE_FUNC void deallocate(void *buffer) const EIGEN_OVERRIDE
Definition: cxx11_tensor_thread_pool.cpp:26
CALL_SUBTEST_1
#define CALL_SUBTEST_1(FUNC)
Definition: split_test_helper.h:4
EIGEN_DECLARE_TEST
EIGEN_DECLARE_TEST(cxx11_tensor_thread_pool)
Definition: cxx11_tensor_thread_pool.cpp:673
Eigen::internal::aligned_free
EIGEN_DEVICE_FUNC void aligned_free(void *ptr)
Definition: Memory.h:198
TestAllocator::allocate
EIGEN_DEVICE_FUNC void * allocate(size_t num_bytes) const EIGEN_OVERRIDE
Definition: cxx11_tensor_thread_pool.cpp:22
j
std::ptrdiff_t j
Definition: tut_arithmetic_redux_minmax.cpp:2
test_multithreaded_reductions
void test_multithreaded_reductions()
Definition: cxx11_tensor_thread_pool.cpp:583
Eigen::Tensor::resize
EIGEN_DEVICE_FUNC void resize(const array< Index, NumIndices > &dimensions)
Definition: Tensor.h:447
left
static char left
Definition: blas_interface.hh:62
Eigen::Tensor::dimensions
EIGEN_DEVICE_FUNC const EIGEN_STRONG_INLINE Dimensions & dimensions() const
Definition: Tensor.h:102
test_async_multithread_elementwise
void test_async_multithread_elementwise()
Definition: cxx11_tensor_thread_pool.cpp:61
test_multithread_random
void test_multithread_random()
Definition: cxx11_tensor_thread_pool.cpp:624
CALL_SUBTEST_10
#define CALL_SUBTEST_10(FUNC)
Definition: split_test_helper.h:58
l
static const Line3 l(Rot3(), 1, 1)
TestAllocator::~TestAllocator
~TestAllocator() EIGEN_OVERRIDE
Definition: cxx11_tensor_thread_pool.cpp:21
test_multithread_contraction
void test_multithread_contraction()
Definition: cxx11_tensor_thread_pool.cpp:110
Eigen::DSizes::TotalSize
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex TotalSize() const
Definition: TensorDimensions.h:271
test_multithread_shuffle
void test_multithread_shuffle(Allocator *allocator)
Definition: cxx11_tensor_thread_pool.cpp:633
CALL_SUBTEST_5
#define CALL_SUBTEST_5(FUNC)
Definition: split_test_helper.h:28
EIGEN_ALWAYS_INLINE
#define EIGEN_ALWAYS_INLINE
Definition: Macros.h:932
Eigen::ThreadPoolTempl
Definition: NonBlockingThreadPool.h:16
CALL_SUBTEST_6
#define CALL_SUBTEST_6(FUNC)
Definition: split_test_helper.h:34
CALL_SUBTEST_2
#define CALL_SUBTEST_2(FUNC)
Definition: split_test_helper.h:10
Eigen::Map
A matrix or vector expression mapping an existing array of data.
Definition: Map.h:94
out
std::ofstream out("Result.txt")
test_multithread_contraction_agrees_with_singlethread
void test_multithread_contraction_agrees_with_singlethread()
Definition: cxx11_tensor_thread_pool.cpp:237
test_async_sharded_by_inner_dim_contraction
static void test_async_sharded_by_inner_dim_contraction()
Definition: cxx11_tensor_thread_pool.cpp:461
tree::f
Point2(* f)(const Point3 &, OptionalJacobian< 2, 3 >)
Definition: testExpression.cpp:218
VERIFY_IS_APPROX
#define VERIFY_IS_APPROX(a, b)
Definition: integer_types.cpp:15
Eigen::TensorBase< Tensor< Scalar_, NumIndices_, Options_, IndexType_ > >::setRandom
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor< Scalar_, NumIndices_, Options_, IndexType_ > & setRandom()
Definition: TensorBase.h:996
test_threadpool_allocate
void test_threadpool_allocate(TestAllocator *allocator)
Definition: cxx11_tensor_thread_pool.cpp:657
right
static char right
Definition: blas_interface.hh:61
a
ArrayXXi a
Definition: Array_initializer_list_23_cxx11.cpp:1
test_multithread_compound_assignment
void test_multithread_compound_assignment()
Definition: cxx11_tensor_thread_pool.cpp:86
Eigen::TensorBase< Tensor< Scalar_, NumIndices_, Options_, IndexType_ > >::device
TensorDevice< Tensor< Scalar_, NumIndices_, Options_, IndexType_ >, DeviceType > device(const DeviceType &dev)
Definition: TensorBase.h:1145
main.h
TestAllocator
Definition: cxx11_tensor_thread_pool.cpp:19
Eigen::Tensor::data
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar * data()
Definition: Tensor.h:104
Eigen::PlainObjectBase< Matrix< _Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols > >::data
EIGEN_DEVICE_FUNC const EIGEN_STRONG_INLINE Scalar * data() const
Definition: PlainObjectBase.h:247
Eigen::Matrix
The matrix class, also used for vectors and row-vectors.
Definition: 3rdparty/Eigen/Eigen/src/Core/Matrix.h:178
Eigen::Tensor::size
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const
Definition: Tensor.h:103
abs
#define abs(x)
Definition: datatypes.h:17
NULL
#define NULL
Definition: ccolamd.c:609
test_multithread_elementwise
void test_multithread_elementwise()
Definition: cxx11_tensor_thread_pool.cpp:39
Eigen::Barrier
Definition: Barrier.h:18
CALL_SUBTEST_7
#define CALL_SUBTEST_7(FUNC)
Definition: split_test_helper.h:40
align_3::t
Point2 t(10, 10)
CALL_SUBTEST_8
#define CALL_SUBTEST_8(FUNC)
Definition: split_test_helper.h:46
test_sharded_by_inner_dim_contraction_with_output_kernel
static void test_sharded_by_inner_dim_contraction_with_output_kernel()
Definition: cxx11_tensor_thread_pool.cpp:421
DimPair
Tensor< float, 1 >::DimensionPair DimPair
Definition: cxx11_tensor_contraction.cpp:17
Eigen::internal::NormalRandomGenerator
Definition: TensorRandom.h:238
ceres::sqrt
Jet< T, N > sqrt(const Jet< T, N > &f)
Definition: jet.h:418
i
int i
Definition: BiCGSTAB_step_by_step.cpp:9
TestAllocator::alloc_count_
int alloc_count_
Definition: cxx11_tensor_thread_pool.cpp:35
test_async_multithread_contraction_agrees_with_singlethread
void test_async_multithread_contraction_agrees_with_singlethread()
Definition: cxx11_tensor_thread_pool.cpp:334
Eigen::Barrier::Notify
void Notify()
Definition: Barrier.h:25
VERIFY
#define VERIFY(a)
Definition: main.h:380
Eigen::Index
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition: Meta.h:74
test_sharded_by_inner_dim_contraction
static void test_sharded_by_inner_dim_contraction()
Definition: cxx11_tensor_thread_pool.cpp:381


gtsam
Author(s):
autogenerated on Tue Jan 7 2025 04:02:07