Program Listing for File factorize.hpp

↰ Return to documentation for file (/tmp/ws/src/proxsuite/include/proxsuite/linalg/sparse/factorize.hpp)
//
// Copyright (c) 2022 INRIA
//
#ifndef PROXSUITE_LINALG_SPARSE_LDLT_FACTORIZE_HPP
#define PROXSUITE_LINALG_SPARSE_LDLT_FACTORIZE_HPP

#include "proxsuite/linalg/sparse/core.hpp"
#include <Eigen/OrderingMethods>

namespace proxsuite {
namespace linalg {
namespace sparse {

template<typename I>
auto
transpose_req(proxsuite::linalg::veg::Tag<I> /*tag*/, isize nrows) noexcept
  -> proxsuite::linalg::veg::dynstack::StackReq
{
  return { nrows * isize(sizeof(I)), isize(alignof(I)) };
}

// at = a.T
template<typename T, typename I>
void
transpose( //
  MatMut<T, I> at,
  MatRef<T, I> a,
  DynStackMut stack) noexcept(VEG_CONCEPT(nothrow_copyable<T>))
{
  using namespace _detail;

  VEG_ASSERT_ALL_OF( //
    at.is_compressed(),
    at.nrows() == a.ncols(),
    at.ncols() == a.nrows(),
    at.nnz() == a.nnz());

  auto pai = a.row_indices();
  auto pax = a.values();

  auto patp = at.col_ptrs_mut();
  auto pati = at.row_indices_mut();
  auto patx = at.values_mut();

  auto _work = stack.make_new(proxsuite::linalg::veg::Tag<I>{}, at.ncols());
  auto work = _work.ptr_mut();

  // work[i] = num zeros in ith row of A
  if (a.is_compressed()) {
    for (usize p = 0; p < usize(a.nnz()); ++p) {
      util::wrapping_inc(mut(work[util::zero_extend(pai[p])]));
    }
  } else {
    for (usize j = 0; j < a.ncols(); ++j) {
      isize col_start = a.col_start(j);
      isize col_end = a.col_end(j);
      for (isize p = col_start; p < col_end; ++p) {
        util::wrapping_inc(mut(work[util::zero_extend(pai[p])]));
      }
    }
  }

  // compute the cumulative sum
  for (usize j = 0; j < usize(at.ncols()); ++j) {
    patp[j + 1] = util::checked_non_negative_plus(patp[j], work[j]);
    work[j] = patp[j];
  }

  for (usize j = 0; j < usize(a.ncols()); ++j) {
    auto col_start = a.col_start(j);
    auto col_end = a.col_end(j);
    for (usize p = col_start; p < col_end; ++p) {
      auto i = util::zero_extend(pai[p]);
      auto q = util::zero_extend(work[i]);

      pati[q] = j;
      patx[q] = pax[p];
      util::wrapping_inc(mut(work[i]));
    }
  }
}

template<typename I>
auto
transpose_symbolic_req(proxsuite::linalg::veg::Tag<I> /*tag*/,
                       isize nrows) noexcept
  -> proxsuite::linalg::veg::dynstack::StackReq
{
  return { nrows * isize(sizeof(I)), isize(alignof(I)) };
}

template<typename I>
void
transpose_symbolic( //
  SymbolicMatMut<I> at,
  SymbolicMatRef<I> a,
  DynStackMut stack) noexcept
{
  using namespace _detail;

  VEG_ASSERT_ALL_OF( //
    at.is_compressed(),
    at.nrows() == a.ncols(),
    at.ncols() == a.nrows(),
    at.nnz() == a.nnz());

  auto pai = a.row_indices();

  auto patp = at.col_ptrs_mut();
  auto pati = at.row_indices_mut();

  auto _work = stack.make_new(proxsuite::linalg::veg::Tag<I>{}, at.ncols());
  auto work = _work.ptr_mut();

  // work[i] = num zeros in ith row of A
  for (usize p = 0; p < usize(a.nnz()); ++p) {
    util::wrapping_inc(mut(work[util::zero_extend(pai[p])]));
  }

  // compute the cumulative sum
  for (usize j = 0; j < usize(at.ncols()); ++j) {
    patp[j + 1] = util::checked_non_negative_plus(patp[j], work[j]);
    work[j] = patp[j];
  }

  for (usize j = 0; j < usize(a.ncols()); ++j) {
    auto col_start = a.col_start(j);
    auto col_end = a.col_end(j);
    for (usize p = col_start; p < col_end; ++p) {
      auto i = util::zero_extend(pai[p]);
      auto q = util::zero_extend(work[i]);

      pati[q] = I(j);
      util::wrapping_inc(mut(work[i]));
    }
  }
}

template<typename T, typename I>
void
dense_lsolve(DenseVecMut<T> x, MatRef<T, I> l) noexcept(false)
{
  using namespace _detail;

  VEG_ASSERT_ALL_OF( //
    l.nrows() == l.ncols(),
    x.nrows() == l.nrows()
    /* l is unit lower triangular */
  );

  usize n = usize(l.nrows());

  auto pli = l.row_indices();
  auto plx = l.values();

  auto px = x.as_slice_mut().ptr_mut();

  for (usize j = 0; j < n; ++j) {
    auto const xj = px[j];
    auto col_start = l.col_start(j);
    auto col_end = l.col_end(j);

    // skip the diagonal entry
    for (usize p = col_start + 1; p < col_end; ++p) {
      auto i = util::zero_extend(pli[p]);
      px[i] -= plx[p] * xj;
    }
  }
}

template<typename T, typename I>
void
dense_ltsolve(DenseVecMut<T> x, MatRef<T, I> l) noexcept(false)
{
  using namespace _detail;

  VEG_ASSERT_ALL_OF( //
    l.nrows() == l.ncols(),
    x.nrows() == l.nrows()
    /* l is unit lower triangular */
  );

  usize n = usize(l.nrows());

  auto pli = l.row_indices();
  auto plx = l.values();

  auto px = x.as_slice_mut().ptr_mut();

  usize j = n;
  while (true) {
    if (j == 0) {
      break;
    }
    --j;

    auto col_start = l.col_start(j);
    auto col_end = l.col_end(j);
    T acc0 = 0;
    T acc1 = 0;
    T acc2 = 0;
    T acc3 = 0;

    // skip the diagonal entry
    usize pstart = col_start + 1;
    usize pcount = col_end - pstart;

    usize p = pstart;
    for (; p < pstart + pcount / 4 * 4; p += 4) {
      auto i0 = util::zero_extend(pli[p + 0]);
      auto i1 = util::zero_extend(pli[p + 1]);
      auto i2 = util::zero_extend(pli[p + 2]);
      auto i3 = util::zero_extend(pli[p + 3]);
      acc0 += plx[p + 0] * px[i0];
      acc1 += plx[p + 1] * px[i1];
      acc2 += plx[p + 2] * px[i2];
      acc3 += plx[p + 3] * px[i3];
    }
    for (; p < pstart + pcount; ++p) {
      auto i0 = util::zero_extend(pli[p + 0]);
      acc0 += plx[p + 0] * px[i0];
    }

    acc0 = (acc0 + acc1) + (acc2 + acc3);

    px[j] -= acc0;
  }
}

template<typename I>
auto
etree_req(proxsuite::linalg::veg::Tag<I> /*tag*/, isize n) noexcept
  -> proxsuite::linalg::veg::dynstack::StackReq
{
  return { n * isize{ sizeof(I) }, alignof(I) };
}

template<typename I>
VEG_INLINE void
etree( //
  I* parent,
  SymbolicMatRef<I> a,
  DynStackMut stack) noexcept
{
  using namespace _detail;

  usize n = usize(a.ncols());
  auto pai = a.row_indices();

  auto _work =
    stack.make_new_for_overwrite(proxsuite::linalg::veg::Tag<I>{}, isize(n));
  auto pancestors = _work.ptr_mut();

  // for each column of a
  for (usize k = 0; k < n; ++k) {
    parent[k] = I(-1);
    pancestors[k] = I(-1);
    // assuming elimination subtree T_{k-1} is known, compute T_k

    auto col_start = a.col_start(k);
    auto col_end = a.col_end(k);

    // for each non zero element of a
    for (usize p = col_start; p < col_end; ++p) {
      // get the row
      auto i = util::zero_extend(pai[p]);
      // skip if looking at lower triangular half
      if (i >= k) {
        continue;
      }

      // go up towards the root of the tree
      usize node = i;
      auto next = usize(-1);
      while (true) {
        if (node == usize(-1) || node >= k) {
          break;
        }

        // use the highest known ancestor instead of the parent
        next = util::sign_extend(pancestors[node]);

        // set the highest known ancestor to k, since we know we're going
        // to find it eventually
        pancestors[node] = I(k);

        // if there is no highest known ancestor, we must have hit the root of
        // the tree
        // set the parent to k
        if (next == usize(-1)) {
          parent[node] = I(k);
          break;
        }
        // go to the highest ancestor
        node = next;
      }
    }
  }
}

namespace _detail {
inline auto
ereach_req(isize k) noexcept -> proxsuite::linalg::veg::dynstack::StackReq
{
  return { (k + 1) * isize{ sizeof(bool) }, alignof(bool) };
}

// compute the set of reachable nodes from the non zero pattern of a_{.,k}
// not including the node k itself
template<typename I>
VEG_NODISCARD VEG_INLINE auto
ereach(usize& count,
       I* s,
       SymbolicMatRef<I> a,
       I const* parent,
       isize k,
       bool* pmarked) noexcept -> I*
{

  usize n = usize(a.ncols());
  auto k_ = usize(k);

  auto pai = a.row_indices();
  auto pparent = parent;

  auto col_start = a.col_start(k_);
  auto col_end = a.col_end(k_);

  pmarked[k_] = true;

  usize top = n;

  // for each non zero element of a_{.,k}
  for (usize p = col_start; p < col_end; ++p) {
    auto i = util::zero_extend(pai[p]);

    // only triu part of a
    if (i > k_) {
      continue;
    }

    usize len = 0;
    while (true) {
      // if we reach a marked node
      if (pmarked[i]) {
        break;
      }

      // can't overwrite top of the stack since elements of s are unique
      // and s is large enough to hold all the nodes
      s[isize(len)] = I(i);
      util::wrapping_inc(mut(len));

      // mark node i as reached
      pmarked[i] = true;
      i = util::sign_extend(pparent[i]);
    }

    // make sure that we can memmove
    std::memmove( //
      s + (top - len),
      s,
      usize(len) * sizeof(I));

    // move down the top of the stack
    top = util::wrapping_plus(top, -len);
  }

  for (usize q = top; q < n; ++q) {
    pmarked[s[q]] = false;
  }
  pmarked[k_] = false;

  // [top, end[
  count = n - top;
  return s + top;
}
} // namespace _detail

namespace _detail {
// return the next start_index
template<typename I>
VEG_INLINE auto
postorder_depth_first_search( //
  I* post,
  usize root,
  usize start_index,
  I* pstack,
  I* pfirst_child,
  I* pnext_child) noexcept -> usize
{
  using namespace _detail;

  usize top = 0;
  pstack[0] = I(root);

  // stack is non empty
  while (top != usize(-1)) {
    auto current_node = util::zero_extend(pstack[top]);
    auto current_child = util::sign_extend(pfirst_child[current_node]);

    // no more children
    if (current_child == usize(-1)) {
      post[start_index] = I(current_node);
      ++start_index;

      // pop node from the stack
      util::wrapping_dec(mut(top));
    } else {
      // add current child to the stack
      util::wrapping_inc(mut(top));
      pstack[top] = I(current_child);

      // next child is now the first child
      pfirst_child[current_node] = pnext_child[current_child];
    }
  }
  return start_index;
}
} // namespace _detail

template<typename I>
auto
postorder_req(proxsuite::linalg::veg::Tag<I> /*tag*/, isize n) noexcept
  -> proxsuite::linalg::veg::dynstack::StackReq
{
  return { (3 * n) * isize(sizeof(I)), alignof(I) };
}

template<typename I>
void
postorder(I* post, I const* parent, isize n, DynStackMut stack) noexcept
{
  using namespace _detail;

  auto _work = stack.make_new_for_overwrite(proxsuite::linalg::veg::Tag<I>{},
                                            3 * isize(n));
  I* pwork = _work.ptr_mut();

  I* pstack = pwork;
  I* pfirst_child = pstack + n;
  I* pnext_child = pfirst_child + n;

  // no children are found yet
  for (usize j = 0; j < usize(n); ++j) {
    pfirst_child[j] = I(-1);
  }

  for (usize _j = 0; _j < usize(n); ++_j) {
    // traverse in reverse order, since the children appear in reverse order
    // of insertion in the linked list
    usize j = usize(n) - 1 - _j;

    // if not a root node
    if (parent[isize(j)] != I(-1)) {
      // next child of this node is the previous first child
      pnext_child[j] = pfirst_child[util::zero_extend(parent[isize(j)])];
      // set this node to be the new first child
      pfirst_child[util::zero_extend(parent[isize(j)])] = I(j);
    }
  }

  usize start_index = 0;
  for (usize root = 0; root < usize(n); ++root) {
    if (parent[isize(root)] == I(-1)) {
      start_index = _detail::postorder_depth_first_search(
        post, root, start_index, pstack, pfirst_child, pnext_child);
    }
  }
}

namespace _detail {
// returns -2 if j is not a leaf
// returns -1 if j is a first leaf
// returns the least common ancestor of j and the previous j otherwise
template<typename I>
VEG_INLINE auto
least_common_ancestor(usize i,
                      usize j,
                      I const* pfirst,
                      I* pmax_first,
                      I* pprev_leaf,
                      I* pancestor) noexcept -> I
{
  using namespace _detail;

  // if upper triangular part, or not a leaf
  // leaves always have a new larger value of pfirst
  // add 1 to get the correct result when comparing with -1
  if (i <= j || util::wrapping_plus(pfirst[j], I(1)) <=
                  util::wrapping_plus(pmax_first[i], I(1))) {
    return I(-2);
  }

  // update the largest max_first
  // no need to compare because the value of j increases
  // inbetween successive calls, and so does first_j
  pmax_first[i] = pfirst[j];

  // get the previous j
  usize j_prev = util::sign_extend(pprev_leaf[i]);

  // set the previous j to the current j
  pprev_leaf[i] = I(j);

  // if first leaf
  if (j_prev == usize(-1)) {
    return I(-1);
  }

  // else, subsequent leaf
  // get the least common ancestor of j and j_prev
  usize lca = j_prev;
  while (true) {
    if (lca == util::zero_extend(pancestor[lca])) {
      break;
    }
    lca = util::zero_extend(pancestor[lca]);
  }

  // compress the path to speed up the subsequent calls
  // to this function
  usize node = j_prev;
  while (true) {
    if (node == lca) {
      break;
    }
    usize next = util::zero_extend(pancestor[node]);
    pancestor[node] = I(lca);
    node = next;
  }

  return I(lca);
}
} // namespace _detail

template<typename I>
auto
column_counts_req(proxsuite::linalg::veg::Tag<I> tag,
                  isize n,
                  isize nnz) noexcept
  -> proxsuite::linalg::veg::dynstack::StackReq
{
  using proxsuite::linalg::veg::dynstack::StackReq;
  return StackReq{
    isize{ sizeof(I) } * (1 + 5 * n + nnz),
    alignof(I),
  } & sparse::transpose_symbolic_req(tag, n);
}

template<typename I>
void
column_counts(I* counts,
              SymbolicMatRef<I> a,
              I const* parent,
              I const* post,
              DynStackMut stack) noexcept
{
  // https://youtu.be/uZKJPTo4dZs
  using namespace _detail;
  usize n = usize(a.nrows());
  auto _at_work = stack.make_new_for_overwrite(proxsuite::linalg::veg::Tag<I>{},
                                               1 + 5 * isize(n) + a.nnz());
  auto pat_work = _at_work.ptr_mut();
  pat_work[0] = 0;
  pat_work[n] = I(a.nnz());

  SymbolicMatMut<I> at{
    from_raw_parts, isize(n), isize(n),         a.nnz(),
    pat_work,       nullptr,  pat_work + n + 1,
  };
  sparse::transpose_symbolic(at, a, stack);

  auto patp = at.col_ptrs();
  auto pati = at.row_indices();

  auto pwork = pat_work + n + 1 + a.nnz();

  auto pdelta = counts;

  auto pfirst = pwork;
  auto pmax_first = pwork + n;
  auto pprev_leaf = pwork + 2 * n;
  auto pancestor = pwork + 3 * n;

  auto pcounts = counts;
  auto ppost = post;
  auto pparent = parent;

  for (usize i = 0; i < 3 * n; ++i) {
    pwork[i] = I(-1);
  }
  for (usize i = 0; i < n; ++i) {
    pancestor[i] = I(i);
  }

  // for each column in a
  for (usize k = 0; k < n; ++k) {
    // in postordered fashion
    auto j = util::zero_extend(ppost[k]);

    // if first_j isn't computed, j must be a leaf
    // because if it's not a leaf then first_j will be initialized from
    // the init loop of its first descendant
    //
    // in which case initialize delta_j to 1
    pdelta[j] = (pfirst[j] == I(-1)) ? I(1) : I(0);

    // init loop
    while (true) {
      // while j is not a root, and the first descendant of j isn't computed
      // set the first descendant of j to k, as well as all of its ancestors
      // that don't yet have a first descendant
      if (j == usize(-1) || pfirst[j] != I(-1)) {
        break;
      }
      pfirst[j] = I(k);
      j = util::sign_extend(pparent[j]);
    }
  }

  // for each node
  for (usize k = 0; k < n; ++k) {
    // in postordered fashion
    auto j = util::zero_extend(ppost[k]);

    // if this node is the child of some other node
    if (pparent[j] != I(-1)) {
      // decrement the delta of that node
      // corresponding to the correction term e_j
      util::wrapping_dec(mut(pdelta[util::zero_extend(pparent[j])]));
    }

    auto col_start = util::zero_extend(patp[j]);
    auto col_end = util::zero_extend(patp[j + 1]);

    // iterate over lower triangular half of a
    for (usize p = col_start; p < col_end; ++p) {
      auto i = util::zero_extend(pati[p]);
      I lca = _detail::least_common_ancestor( //
        i,
        j,
        pfirst,
        pmax_first,
        pprev_leaf,
        pancestor);

      // if j is a leaf of T^i
      if (lca != I(-2)) {
        util::wrapping_inc(mut(pdelta[j]));

        // if j is a subsequent leaf
        if (lca != I(-1)) {
          util::wrapping_dec(mut(pdelta[util::zero_extend(lca)]));
        }
      }
    }

    if (pparent[j] != -1) {
      // set the ancestor of j
      pancestor[j] = pparent[j];
    }
  }

  // sum up the deltas
  for (usize j = 0; j < n; ++j) {
    if (parent[isize(j)] != I(-1)) {
      pcounts[util::zero_extend(parent[isize(j)])] = util::wrapping_plus(
        pcounts[util::zero_extend(parent[isize(j)])], pcounts[j]);
    }
  }
}

template<typename I>
auto
amd_req(proxsuite::linalg::veg::Tag<I> /*tag*/, isize /*n*/, isize nnz) noexcept
  -> proxsuite::linalg::veg::dynstack::StackReq
{
  return { nnz * isize{ sizeof(char) }, alignof(char) };
}

template<typename I>
void
amd(I* perm, SymbolicMatRef<I> mat, DynStackMut stack) noexcept
{
  // TODO: reimplement amd under BSD-3
  // https://github.com/DrTimothyAldenDavis/SuiteSparse/tree/master/AMD

  isize n = mat.nrows();
  isize nnz = mat.nnz();

  Eigen::PermutationMatrix<-1, -1, I> perm_eigen;
  auto _ = stack.make_new(proxsuite::linalg::veg::Tag<char>{}, nnz);

  Eigen::AMDOrdering<I>{}(
    Eigen::Map<Eigen::SparseMatrix<char, Eigen::ColMajor, I> const>{
      n,
      n,
      nnz,
      mat.col_ptrs(),
      mat.row_indices(),
      _.ptr(),
      mat.nnz_per_col(),
    }
      .template selfadjointView<Eigen::Upper>(),

    perm_eigen);
  std::memmove( //
    perm,
    perm_eigen.indices().data(),
    usize(n) * sizeof(I));
}

namespace _detail {
template<typename I>
void
inv_perm(I* perm_inv, I const* perm, isize n) noexcept
{
  for (usize i = 0; i < usize(n); ++i) {
    perm_inv[util::zero_extend(perm[i])] = I(i);
  }
}

template<typename I>
auto
symmetric_permute_symbolic_req(proxsuite::linalg::veg::Tag<I> /*tag*/,
                               isize n) noexcept
  -> proxsuite::linalg::veg::dynstack::StackReq
{
  return { n * isize{ sizeof(I) }, alignof(I) };
}
template<typename I>
auto
symmetric_permute_req(proxsuite::linalg::veg::Tag<I> /*tag*/, isize n) noexcept
  -> proxsuite::linalg::veg::dynstack::StackReq
{
  return { n * isize{ sizeof(I) }, alignof(I) };
}

template<typename I>
void
symmetric_permute_common(usize n,
                         I const* pperm_inv,
                         SymbolicMatRef<I> old_a,
                         I* pnew_ap,
                         I* pcol_counts)
{
  for (usize old_j = 0; old_j < n; ++old_j) {
    usize new_j = util::zero_extend(pperm_inv[old_j]);

    auto col_start = old_a.col_start(old_j);
    auto col_end = old_a.col_end(old_j);

    for (usize p = col_start; p < col_end; ++p) {
      usize old_i = util::zero_extend(old_a.row_indices()[p]);

      if (old_i <= old_j) {
        usize new_i = util::zero_extend(pperm_inv[old_i]);
        util::wrapping_inc(mut(pcol_counts[new_i > new_j ? new_i : new_j]));
      }
    }
  }

  pnew_ap[0] = I(0);
  for (usize i = 0; i < n; ++i) {
    pnew_ap[i + 1] =
      util::checked_non_negative_plus(pnew_ap[i], pcol_counts[i]);
    pcol_counts[i] = pnew_ap[i];
  }
}

template<typename I>
void
symmetric_permute_symbolic(SymbolicMatMut<I> new_a,
                           SymbolicMatRef<I> old_a,
                           I const* perm_inv,
                           DynStackMut stack) noexcept
{

  usize n = usize(new_a.nrows());

  auto _work = stack.make_new(proxsuite::linalg::veg::Tag<I>{}, isize(n));
  I* pcol_counts = _work.ptr_mut();

  VEG_ASSERT(new_a.is_compressed());
  auto pold_ai = old_a.row_indices();

  auto pnew_ap = new_a.col_ptrs_mut();
  auto pnew_ai = new_a.row_indices_mut();

  auto pperm_inv = perm_inv;

  _detail::symmetric_permute_common(n, pperm_inv, old_a, pnew_ap, pcol_counts);

  auto pcurrent_row_index = pcol_counts;

  for (usize old_j = 0; old_j < n; ++old_j) {
    usize new_j = util::zero_extend(pperm_inv[old_j]);

    auto col_start = old_a.col_start(old_j);
    auto col_end = old_a.col_end(old_j);

    for (usize p = col_start; p < col_end; ++p) {
      usize old_i = util::zero_extend(pold_ai[p]);

      if (old_i <= old_j) {
        usize new_i = util::zero_extend(pperm_inv[old_i]);

        usize new_max = new_i > new_j ? new_i : new_j;
        usize new_min = new_i < new_j ? new_i : new_j;

        auto row_idx = pcurrent_row_index[new_max];
        pnew_ai[row_idx] = I(new_min);
        pcurrent_row_index[new_max] = util::wrapping_plus(row_idx, I(1));
      }
    }
  }
}

template<typename T, typename I>
void
symmetric_permute(MatMut<T, I> new_a,
                  MatRef<T, I> old_a,
                  I const* perm_inv,
                  DynStackMut stack) noexcept(VEG_CONCEPT(nothrow_copyable<T>))
{
  usize n = usize(new_a.nrows());
  auto _work = stack.make_new(proxsuite::linalg::veg::Tag<I>{}, isize(n));
  I* pcol_counts = _work.ptr_mut();

  VEG_ASSERT(new_a.is_compressed());
  auto pold_ai = old_a.row_indices();

  auto pnew_ap = new_a.col_ptrs_mut();
  auto pnew_ai = new_a.row_indices_mut();

  auto pperm_inv = perm_inv;

  _detail::symmetric_permute_common(
    n, pperm_inv, old_a.symbolic(), pnew_ap, pcol_counts);

  auto pcurrent_row_index = pcol_counts;

  auto pold_ax = old_a.values();
  auto pnew_ax = new_a.values_mut();
  for (usize old_j = 0; old_j < n; ++old_j) {
    usize new_j = util::zero_extend(pperm_inv[old_j]);

    auto col_start = old_a.col_start(old_j);
    auto col_end = old_a.col_end(old_j);

    for (usize p = col_start; p < col_end; ++p) {
      usize old_i = util::zero_extend(pold_ai[p]);

      if (old_i <= old_j) {
        usize new_i = util::zero_extend(pperm_inv[old_i]);

        usize new_max = new_i > new_j ? new_i : new_j;
        usize new_min = new_i < new_j ? new_i : new_j;

        auto row_idx = pcurrent_row_index[new_max];
        pnew_ai[row_idx] = I(new_min);
        pnew_ax[row_idx] = pold_ax[p];
        pcurrent_row_index[new_max] = util::wrapping_plus(row_idx, I(1));
      }
    }
  }
}
} // namespace _detail

enum struct Ordering : unsigned char
{
  natural,
  user_provided,
  amd,
  ENUM_END,
};

template<typename I>
auto
factorize_symbolic_req(proxsuite::linalg::veg::Tag<I> tag,
                       isize n,
                       isize nnz,
                       Ordering o) noexcept
  -> proxsuite::linalg::veg::dynstack::StackReq
{
  using proxsuite::linalg::veg::dynstack::StackReq;
  constexpr isize sz{ sizeof(I) };
  constexpr isize al{ alignof(I) };

  StackReq perm_req{ 0, al };
  StackReq amd_req{ 0, al };
  switch (o) {
    case Ordering::natural:
      break;
    case Ordering::amd:
      amd_req =
        StackReq{ n * sz, al } & StackReq{ sparse::amd_req(tag, n, nnz) };
      HEDLEY_FALL_THROUGH;
    case Ordering::user_provided:
      perm_req = perm_req & StackReq{ (n + 1 + nnz) * sz, al };
      perm_req = perm_req & _detail::symmetric_permute_symbolic_req(tag, n);
    default:
      break;
  }

  StackReq parent_req = { n * sz, al };
  StackReq post_req = { n * sz, al };

  StackReq etree_req = sparse::etree_req(tag, n);
  StackReq postorder_req = sparse::postorder_req(tag, n);
  StackReq colcount_req = sparse::column_counts_req(tag, n, nnz);

  return amd_req              //
         | (perm_req          //
            & (parent_req     //
               & (etree_req   //
                  | (post_req //
                     & (postorder_req | colcount_req)))));
}

template<typename I>
void
factorize_symbolic_non_zeros(I* nnz_per_col,
                             I* etree,
                             I* perm_inv,
                             I const* perm,
                             SymbolicMatRef<I> a,
                             DynStackMut stack) noexcept
{

  bool id_perm = perm_inv == nullptr;
  bool user_perm = perm != nullptr;

  Ordering o = user_perm ? Ordering::user_provided
               : id_perm ? Ordering::natural
                         : Ordering::amd;

  proxsuite::linalg::veg::Tag<I> tag{};

  usize n = usize(a.ncols());

  switch (o) {
    case Ordering::natural:
      break;

    case Ordering::amd: {
      auto amd_perm = stack.make_new_for_overwrite(tag, isize(n));
      sparse::amd(amd_perm.ptr_mut(), a, stack);
      perm = amd_perm.ptr();
    }
      HEDLEY_FALL_THROUGH;
    case Ordering::user_provided: {
      _detail::inv_perm(perm_inv, perm, isize(n));
    }
    default:
      break;
  }

  auto _permuted_a_col_ptrs =
    stack //
      .make_new_for_overwrite(tag, id_perm ? 0 : (a.ncols() + 1));
  auto _permuted_a_row_indices =
    stack //
      .make_new_for_overwrite(tag, id_perm ? 0 : (a.nnz()));

  if (!id_perm) {
    _permuted_a_col_ptrs.as_mut()[0] = 0;
    _permuted_a_col_ptrs.as_mut()[isize(n)] = I(a.nnz());
    SymbolicMatMut<I> permuted_a{
      from_raw_parts,
      isize(n),
      isize(n),
      a.nnz(),
      _permuted_a_col_ptrs.ptr_mut(),
      nullptr,
      _permuted_a_row_indices.ptr_mut(),
    };
    _detail::symmetric_permute_symbolic(permuted_a, a, perm_inv, stack);
  }

  SymbolicMatRef<I> permuted_a = id_perm ? a
                                         : SymbolicMatRef<I>{
                                             from_raw_parts,
                                             isize(n),
                                             isize(n),
                                             a.nnz(),
                                             _permuted_a_col_ptrs.ptr(),
                                             nullptr,
                                             _permuted_a_row_indices.ptr(),
                                           };

  sparse::etree(etree, permuted_a, stack);

  auto _post = stack.make_new_for_overwrite(tag, isize(n));
  sparse::postorder(_post.ptr_mut(), etree, isize(n), stack);
  sparse::column_counts(nnz_per_col, permuted_a, etree, _post.ptr(), stack);
}

template<typename I>
void
factorize_symbolic_col_counts(I* col_ptrs,
                              I* etree,
                              I* perm_inv,
                              I const* perm,
                              SymbolicMatRef<I> a,
                              DynStackMut stack) noexcept
{

  sparse::factorize_symbolic_non_zeros( //
    col_ptrs + 1,
    etree,
    perm_inv,
    perm,
    a,
    stack);

  usize n = usize(a.ncols());
  auto pcol_ptrs = col_ptrs;
  pcol_ptrs[0] = I(0);
  for (usize i = 0; i < n; ++i) {
    pcol_ptrs[i + 1] =
      util::checked_non_negative_plus(pcol_ptrs[i + 1], pcol_ptrs[i]);
  }
}

template<typename T, typename I>
auto
factorize_numeric_req(proxsuite::linalg::veg::Tag<T> /*ttag*/,
                      proxsuite::linalg::veg::Tag<I> /*itag*/,
                      isize n,
                      isize a_nnz,
                      Ordering o) noexcept
  -> proxsuite::linalg::veg::dynstack::StackReq
{
  using proxsuite::linalg::veg::dynstack::StackReq;

  constexpr isize sz{ sizeof(I) };
  constexpr isize al{ alignof(I) };

  constexpr isize tsz{ sizeof(T) };
  constexpr isize tal{ alignof(T) };

  bool id_perm = o == Ordering::natural;

  auto symb_perm_req = StackReq{ sz * (id_perm ? 0 : (n + 1 + a_nnz)), al };
  auto num_perm_req = StackReq{ tsz * (id_perm ? 0 : a_nnz), tal };
  return num_perm_req                         //
         & (StackReq{ tsz * n, tal }          //
            & (symb_perm_req                  //
               & (StackReq{ 2 * n * sz, al }  //
                  & (StackReq{ n * tsz, tal } //
                     & StackReq{ n * isize{ sizeof(bool) }, alignof(bool) }))));
}

template<typename T, typename I>
void
factorize_numeric( //
  T* values,
  I* row_indices,
  proxsuite::linalg::veg::DoNotDeduce<T const*> diag_to_add,
  proxsuite::linalg::veg::DoNotDeduce<I const*> perm,
  I const* col_ptrs,
  I const* etree,
  I const* perm_inv,
  MatRef<T, I> a,
  DynStackMut stack) noexcept(false)
{
  using namespace _detail;
  isize n = a.nrows();

  bool id_perm = perm_inv == nullptr;

  proxsuite::linalg::veg::Tag<I> tag{};

  auto _permuted_a_values = stack.make_new_for_overwrite(
    proxsuite::linalg::veg::Tag<T>{}, id_perm ? 0 : a.nnz());

  auto _x = stack.make_new_for_overwrite(proxsuite::linalg::veg::Tag<T>{}, n);

  auto _permuted_a_col_ptrs =
    stack.make_new_for_overwrite(tag, id_perm ? 0 : (a.ncols() + 1));
  auto _permuted_a_row_indices =
    stack.make_new_for_overwrite(tag, id_perm ? 0 : a.nnz());

  if (!id_perm) {
    _permuted_a_col_ptrs.as_mut()[0] = 0;
    _permuted_a_col_ptrs.as_mut()[n] = I(a.nnz());
    MatMut<T, I> permuted_a{
      from_raw_parts,
      n,
      n,
      a.nnz(),
      _permuted_a_col_ptrs.ptr_mut(),
      nullptr,
      _permuted_a_row_indices.ptr_mut(),
      _permuted_a_values.ptr_mut(),
    };
    _detail::symmetric_permute(permuted_a, a, perm_inv, stack);
  }

  MatRef<T, I> permuted_a = id_perm ? a
                                    : MatRef<T, I>{
                                        from_raw_parts,
                                        isize(n),
                                        isize(n),
                                        a.nnz(),
                                        _permuted_a_col_ptrs.ptr(),
                                        nullptr,
                                        _permuted_a_row_indices.ptr(),
                                        _permuted_a_values.ptr(),
                                      };

  auto _current_row_index = stack.make_new_for_overwrite(tag, n);
  auto _ereach_stack_storage = stack.make_new_for_overwrite(tag, n);

  I* pcurrent_row_index = _current_row_index.ptr_mut();
  T* px = _x.ptr_mut();

  std::memcpy( //
    pcurrent_row_index,
    col_ptrs,
    usize(n) * sizeof(I));
  for (usize i = 0; i < usize(n); ++i) {
    px[i] = 0;
  }

  // compute the iter-th row of L using the iter-th column of permuted_a
  // the diagonal element is filled with the diagonal of D instead of 1
  I const* plp = col_ptrs;

  auto _marked = stack.make_new(proxsuite::linalg::veg::Tag<bool>{}, n);
  for (usize iter = 0; iter < usize(n); ++iter) {
    usize ereach_count = 0;
    auto ereach_stack = _detail::ereach(ereach_count,
                                        _ereach_stack_storage.ptr_mut(),
                                        permuted_a.symbolic(),
                                        etree,
                                        isize(iter),
                                        _marked.ptr_mut());

    auto pereach_stack = ereach_stack;

    I const* pai = permuted_a.row_indices();
    T const* pax = permuted_a.values();

    I* pli = row_indices;
    T* plx = values;

    {
      auto col_start = permuted_a.col_start(iter);
      auto col_end = permuted_a.col_end(iter);

      // scatter permuted_a column into x
      // untouched columns are already zeroed

      for (usize p = col_start; p < col_end; ++p) {
        auto i = util::zero_extend(pai[p]);
        px[i] = pax[p];
      }
    }
    T d = px[iter] + ((diag_to_add == nullptr || perm == nullptr)
                        ? T(0)
                        : diag_to_add[util::zero_extend(perm[iter])]);

    // zero for next iteration
    px[iter] = 0;

    for (usize q = 0; q < ereach_count; ++q) {
      usize j = util::zero_extend(pereach_stack[q]);
      auto col_start = util::zero_extend(plp[j]);
      auto row_idx = util::zero_extend(pcurrent_row_index[j]) + 1;

      T const xj = px[j];
      T const dj = plx[col_start];
      T const lkj = xj / dj;

      // zero for the next iteration
      px[j] = 0;

      // skip first element, to put diagonal there later
      for (usize p = col_start + 1; p < row_idx; ++p) {
        auto i = util::zero_extend(pli[p]);
        px[i] -= plx[p] * xj;
      }

      d -= lkj * xj;

      pli[row_idx] = I(iter);
      plx[row_idx] = lkj;
      pcurrent_row_index[j] = I(row_idx);
    }
    {
      auto col_start = util::zero_extend(plp[iter]);
      pli[col_start] = I(iter);
      plx[col_start] = d;
    }
  }
}
} // namespace sparse
} // namespace linalg
} // namespace proxsuite
#endif /* end of include guard PROXSUITE_LINALG_SPARSE_LDLT_FACTORIZE_HPP */