// Source: tensor-compiler.github.io/examples/ttv_compute.c (master branch of tensor-compiler/tensor-compiler.github.io on GitHub)

// Generated by the Tensor Algebra Compiler (tensor-compiler.org)
// taco "A(i,j)=B(i,j,k)*c(k)" -f=A:ds:0,1 -f=B:sss:0,1,2 -f=c:d:0 -s="assemble(A,Insert)" -s="parallelize(i,CPUThread,NoRaces)" -write-source=taco_kernel.c -write-compute=taco_compute.c -write-assembly=taco_assembly.c

/*
 * Compute kernel for A(i,j) = B(i,j,k) * c(k).
 *
 * For every stored (i,j) position of B that has at least one stored k entry,
 * the products B_vals[k-position] * c_vals[k] are summed and the sum is
 * written into A's value array at the current cursor A->indices[1][0][i],
 * which is then advanced by one.
 *
 * Parameters:
 *   A  output; its level-1 position array (A->indices[1][0]) and value array
 *      (A->vals) are both written.
 *   B  input; position/coordinate arrays of its three levels and its values
 *      are read only.
 *   c  input; its value array is indexed directly by the k coordinate.
 *
 * NOTE(review): the function treats A's position array as already populated
 * on entry (it is read before it is written), presumably by a separate
 * assembly step that is not visible here -- confirm against the caller.
 *
 * Returns 0 unconditionally.
 */
int compute(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *c) {
  /* Unpack the raw arrays backing each tensor. */
  int a_rows = (int)(A->dimensions[0]);
  int *restrict a_pos = (int*)(A->indices[1][0]);
  double *restrict a_vals = (double*)(A->vals);
  int *restrict b_pos1 = (int*)(B->indices[0][0]);
  int *restrict b_crd1 = (int*)(B->indices[0][1]);
  int *restrict b_pos2 = (int*)(B->indices[1][0]);
  int *restrict b_crd2 = (int*)(B->indices[1][1]);
  int *restrict b_pos3 = (int*)(B->indices[2][0]);
  int *restrict b_crd3 = (int*)(B->indices[2][1]);
  double *restrict b_vals = (double*)(B->vals);
  int c_len = (int)(c->dimensions[0]);
  double *restrict c_vals = (double*)(c->vals);

  /* The generated code reads these but never uses them afterwards. */
  (void)b_crd2;
  (void)c_len;

  /* Clear the first a_pos[a_rows] value slots of A. */
  const int a_slots = a_pos[a_rows];
  #pragma omp parallel for schedule(static)
  for (int32_t slot = 0; slot < a_slots; slot++) {
    a_vals[slot] = 0.0;
  }

  /* Walk the stored first-level entries of B; each one maps to row `row`. */
  const int outer_begin = b_pos1[0];
  const int outer_end = b_pos1[1];
  #pragma omp parallel for schedule(runtime)
  for (int32_t ib = outer_begin; ib < outer_end; ib++) {
    const int32_t row = b_crd1[ib];
    const int32_t mid_end = b_pos2[ib + 1];

    for (int32_t jb = b_pos2[ib]; jb < mid_end; jb++) {
      const int32_t inner_begin = b_pos3[jb];
      const int32_t inner_end = b_pos3[jb + 1];

      /* Nothing stored along k: no value is appended to A for this (i,j). */
      if (inner_begin >= inner_end) {
        continue;
      }

      /* Reduce over k, accumulating in storage order starting from 0.0. */
      double sum = 0.0;
      for (int32_t kb = inner_begin; kb < inner_end; kb++) {
        sum += b_vals[kb] * c_vals[b_crd3[kb]];
      }

      /* Append the reduced value at row's cursor and advance the cursor. */
      const int32_t dst = a_pos[row];
      a_pos[row] = dst + 1;
      a_vals[dst] = sum;
    }
  }

  /*
   * Shift the position array one slot to the right, highest index first so
   * no entry is overwritten before it is copied, then reset the first entry.
   * NOTE(review): presumably this turns the advanced cursors back into
   * segment start offsets -- confirm against the assembly kernel.
   */
  for (int32_t idx = a_rows; idx > 0; idx--) {
    a_pos[idx] = a_pos[idx - 1];
  }
  a_pos[0] = 0;

  return 0;
}