25 using namespace gtsam;
 
   29 #include <tbb/blocked_range.h>            
   30 #include <tbb/tick_count.h>               
   31 #include <tbb/parallel_for.h>             
   32 #include <tbb/cache_aligned_allocator.h>  
   33 #include <tbb/task_arena.h>               
   34 #include <tbb/task_group.h>               
   36 static const DenseIndex numberOfProblems = 1000000;
 
   42 struct ResultWithThreads
 
   44   typedef map<int, double>::value_type value_type;
 
   45   map<int, double> grainSizesWithoutAllocation;
 
   46   map<int, double> grainSizesWithAllocation;
 
   49 typedef map<int, ResultWithThreads> Results;
 
   52 struct WorkerWithoutAllocation
 
   58   void operator()(
const tbb::blocked_range<size_t>& r)
 const 
   60     for(
size_t i = r.begin(); 
i != r.end(); ++
i)
 
   62       FixedMatrix 
m1 = FixedMatrix::Random();
 
   63       FixedMatrix 
m2 = FixedMatrix::Random();
 
   71 map<int, double> testWithoutMemoryAllocation(
int num_threads)
 
   76   tbb::task_arena arena(num_threads);
 
   80   vector<double> 
results(numberOfProblems);
 
   82   const vector<size_t> grainSizes = {1, 10, 100, 1000};
 
   83   map<int, double> timingResults;
 
   84   for(
size_t grainSize: grainSizes)
 
   86     tbb::tick_count t0 = tbb::tick_count::now();
 
   91         tbb::parallel_for(tbb::blocked_range<size_t>(0, numberOfProblems), WorkerWithoutAllocation(
results));
 
   95     tbb::tick_count t1 = tbb::tick_count::now();
 
   96     cout << 
"Without memory allocation, grain size = " << grainSize << 
", time = " << (t1 - t0).seconds() << endl;
 
   97     timingResults[(
int)grainSize] = (t1 - t0).seconds();
 
  100   return timingResults;
 
  104 struct WorkerWithAllocation
 
  110   void operator()(
const tbb::blocked_range<size_t>& r)
 const 
  112     tbb::cache_aligned_allocator<double> allocator;
 
  113     for(
size_t i = r.begin(); 
i != r.end(); ++
i)
 
  115       double *m1data = allocator.allocate(problemSize * problemSize);
 
  117       double *m2data = allocator.allocate(problemSize * problemSize);
 
  119       double *proddata = allocator.allocate(problemSize * problemSize);
 
  122       m1 = Eigen::Matrix4d::Random(problemSize, problemSize);
 
  123       m2 = Eigen::Matrix4d::Random(problemSize, problemSize);
 
  127       allocator.deallocate(m1data, problemSize * problemSize);
 
  128       allocator.deallocate(m2data, problemSize * problemSize);
 
  129       allocator.deallocate(proddata, problemSize * problemSize);
 
  135 map<int, double> testWithMemoryAllocation(
int num_threads)
 
  140   tbb::task_arena arena(num_threads);
 
  144   vector<double> 
results(numberOfProblems);
 
  146   const vector<size_t> grainSizes = {1, 10, 100, 1000};
 
  147   map<int, double> timingResults;
 
  148   for(
size_t grainSize: grainSizes)
 
  150     tbb::tick_count t0 = tbb::tick_count::now();
 
  155         tbb::parallel_for(tbb::blocked_range<size_t>(0, numberOfProblems), WorkerWithAllocation(
results));
 
  159     tbb::tick_count t1 = tbb::tick_count::now();
 
  160     cout << 
"With memory allocation, grain size = " << grainSize << 
", time = " << (t1 - t0).seconds() << endl;
 
  161     timingResults[(
int)grainSize] = (t1 - t0).seconds();
 
  164   return timingResults;
 
  168 int main(
int argc, 
char* argv[])
 
  170   cout << 
"numberOfProblems = " << numberOfProblems << endl;
 
  171   cout << 
"problemSize = " << problemSize << endl;
 
  173   const vector<int> numThreads = {1, 4, 8};
 
  176   for(
size_t n: numThreads)
 
  178     cout << 
"With " << 
n << 
" threads:" << endl;
 
  179     results[(
int)
n].grainSizesWithoutAllocation = testWithoutMemoryAllocation((
int)
n);
 
  180     results[(
int)
n].grainSizesWithAllocation = testWithMemoryAllocation((
int)
n);
 
  184   cout << 
"Summary of results:" << endl;
 
  185   for(
const Results::value_type& threads_result: 
results)
 
  187     const int threads = threads_result.first;
 
  188     const ResultWithThreads& 
result = threads_result.second;
 
  191       for(
const ResultWithThreads::value_type& grainsize_time: 
result.grainSizesWithoutAllocation)
 
  193         const int grainsize = grainsize_time.first;
 
  194         const double speedup = 
results[1].grainSizesWithoutAllocation[grainsize] / grainsize_time.second;
 
  195         cout << threads << 
" threads, without allocation, grain size = " << grainsize << 
", speedup = " << speedup << endl;
 
  197       for(
const ResultWithThreads::value_type& grainsize_time: 
result.grainSizesWithAllocation)
 
  199         const int grainsize = grainsize_time.first;
 
  200         const double speedup = 
results[1].grainSizesWithAllocation[grainsize] / grainsize_time.second;
 
  201         cout << threads << 
" threads, with allocation, grain size = " << grainsize << 
", speedup = " << speedup << endl;
 
  212 int main(
int argc, 
char* argv [])
 
  214   cout << 
"GTSAM is compiled without TBB, please compile with TBB to use this program." << endl;