/* Copy the block one inner-dimension run at a time. After each run, advance */ \
/* the per-dimension iterator state `it`: bump the first dimension that      */ \
/* still has room, rewinding every exhausted dimension visited before it.    */ \
/* Evaluates to a `return` of the number of elements copied.                 */ \
IndexType num_copied = 0; \
for (num_copied = 0; num_copied < block_total_size; \
     num_copied += dst_inner_dim_size) { \
  /* NOTE(review): the run length is taken to be dst_inner_dim_size,     */ \
  /* matching the loop increment above -- confirm against LinCopy::Run.  */ \
  LinCopy::template Run<KIND>( \
      typename LinCopy::Dst(output_offset, output_stride, dst.data), \
      typename LinCopy::Src(input_offset, input_stride, src.data), \
      dst_inner_dim_size); \
  for (int j = 0; j < idx; ++j) { \
    if (++it[j].count < it[j].size) { \
      /* Dimension j still has room: step to its next position and stop. */ \
      input_offset += it[j].input_stride; \
      output_offset += it[j].output_stride; \
      break; \
    } \
    /* Dimension j is exhausted: reset its counter so it can be walked   */ \
    /* again, rewind both offsets by its span, and carry into j + 1.     */ \
    it[j].count = 0; \
    input_offset -= it[j].input_span; \
    output_offset -= it[j].output_span; \
  } \
} \
return num_copied;