// spadd_assembly.c
// Generated by the Tensor Algebra Compiler (tensor-compiler.org)
// taco "A(i,j)=B(i,j)+C(i,j)" -f=A:ds:0,1 -f=B:ds:0,1 -f=C:ds:0,1 -s="assemble(A,Insert)" -s="parallelize(i,CPUThread,NoRaces)" -write-source=taco_kernel.c -write-compute=taco_compute.c -write-assembly=taco_assembly.c
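// --- Preamble (added for standalone compilation; not part of the generated
// kernel as checked in). The full taco-generated kernel file normally begins
// with a preamble that provides the standard-library includes, the TACO_MIN
// macro, and the taco_tensor_t definition this function relies on. The sketch
// below reproduces a minimal version of that preamble; the exact layout of
// taco_tensor_t beyond the fields assemble() touches (dimensions, indices,
// vals) is an assumption. Compile with -fopenmp to enable the parallel loops
// (the pragmas are ignored harmlessly otherwise).
#include <stdint.h>
#include <stdlib.h>

#ifndef TACO_MIN
#define TACO_MIN(_a, _b) ((_a) < (_b) ? (_a) : (_b))
#endif

#ifndef SPADD_TACO_TENSOR_T_DEFINED
#define SPADD_TACO_TENSOR_T_DEFINED
typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;

typedef struct {
  int32_t      order;          // tensor order (number of modes)
  int32_t*     dimensions;     // tensor dimensions
  int32_t      csize;          // component size
  int32_t*     mode_ordering;  // mode storage ordering
  taco_mode_t* mode_types;     // mode storage types (dense/sparse)
  uint8_t***   indices;        // per-mode index arrays (e.g. pos/crd for CSR)
  uint8_t*     vals;           // tensor values
  int32_t      vals_size;      // values array size
} taco_tensor_t;
#endif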
int assemble(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *C) {
  // Unpack the CSR ("ds") index arrays and values of A, B, and C.
  int A1_dimension = (int)(A->dimensions[0]);
  int* restrict A2_pos = (int*)(A->indices[1][0]);
  int* restrict A2_crd = (int*)(A->indices[1][1]);
  double* restrict A_vals = (double*)(A->vals);
  int B1_dimension = (int)(B->dimensions[0]);
  int* restrict B2_pos = (int*)(B->indices[1][0]);
  int* restrict B2_crd = (int*)(B->indices[1][1]);
  int C1_dimension = (int)(C->dimensions[0]);
  int* restrict C2_pos = (int*)(C->indices[1][0]);
  int* restrict C2_crd = (int*)(C->indices[1][1]);

  // Pass 1: count the nonzeros of each row of A = B + C, i.e. the size of the
  // union of the row sparsity patterns of B and C.
  int32_t* restrict A2_nnz = 0;
  A2_nnz = (int32_t*)malloc(sizeof(int32_t) * C1_dimension);
  #pragma omp parallel for schedule(runtime)
  for (int32_t i = 0; i < C1_dimension; i++) {
    int32_t tjA2_nnz_val = 0;
    int32_t jB = B2_pos[i];
    int32_t pB2_end = B2_pos[(i + 1)];
    int32_t jC = C2_pos[i];
    int32_t pC2_end = C2_pos[(i + 1)];
    // Merge the sorted column lists of row i of B and row i of C, counting
    // each distinct column once.
    while (jB < pB2_end && jC < pC2_end) {
      int32_t jB0 = B2_crd[jB];
      int32_t jC0 = C2_crd[jC];
      int32_t j = TACO_MIN(jB0,jC0);
      tjA2_nnz_val++;
      jB += (int32_t)(jB0 == j);
      jC += (int32_t)(jC0 == j);
    }
    // Whichever operand row was not exhausted contributes its remaining entries.
    if (jB < pB2_end) {
      tjA2_nnz_val += pB2_end - jB;
      jB = pB2_end;
    }
    if (jC < pC2_end) {
      tjA2_nnz_val += pC2_end - jC;
      jC = pC2_end;
    }
    A2_nnz[i] = tjA2_nnz_val;
  }
  // Prefix-sum the per-row counts into A's row pointer array, then allocate
  // A's column index and value arrays to hold the total number of nonzeros.
  A2_pos = (int32_t*)malloc(sizeof(int32_t) * (A1_dimension + 1));
  A2_pos[0] = 0;
  for (int32_t i = 0; i < A1_dimension; i++) {
    A2_pos[i + 1] = A2_pos[i] + A2_nnz[i];
  }
  A2_crd = (int32_t*)malloc(sizeof(int32_t) * A2_pos[A1_dimension]);
  A_vals = (double*)malloc(sizeof(double) * A2_pos[A1_dimension]);
  // Pass 2: merge the row patterns again, this time writing A's column
  // indices. Each row writes into its own disjoint slice of A2_crd, so rows
  // can be assembled in parallel; A2_pos[i] temporarily serves as the write
  // cursor for row i.
  #pragma omp parallel for schedule(runtime)
  for (int32_t i = 0; i < C1_dimension; i++) {
    int32_t jB = B2_pos[i];
    int32_t pB2_end = B2_pos[(i + 1)];
    int32_t jC = C2_pos[i];
    int32_t pC2_end = C2_pos[(i + 1)];
    while (jB < pB2_end && jC < pC2_end) {
      int32_t jB0 = B2_crd[jB];
      int32_t jC0 = C2_crd[jC];
      int32_t j = TACO_MIN(jB0,jC0);
      if (jB0 == j && jC0 == j) {
        int32_t pA2 = A2_pos[i];
        A2_pos[i] = A2_pos[i] + 1;
        A2_crd[pA2] = j;
      }
      else if (jB0 == j) {
        int32_t pA20 = A2_pos[i];
        A2_pos[i] = A2_pos[i] + 1;
        A2_crd[pA20] = j;
      }
      else {
        int32_t pA21 = A2_pos[i];
        A2_pos[i] = A2_pos[i] + 1;
        A2_crd[pA21] = j;
      }
      jB += (int32_t)(jB0 == j);
      jC += (int32_t)(jC0 == j);
    }
    // Append the tail of whichever operand row still has entries left.
    while (jB < pB2_end) {
      int32_t j = B2_crd[jB];
      int32_t pA22 = A2_pos[i];
      A2_pos[i] = A2_pos[i] + 1;
      A2_crd[pA22] = j;
      jB++;
    }
    while (jC < pC2_end) {
      int32_t j = C2_crd[jC];
      int32_t pA23 = A2_pos[i];
      A2_pos[i] = A2_pos[i] + 1;
      A2_crd[pA23] = j;
      jC++;
    }
  }
  // Pass 2 advanced each A2_pos[i] to the end of its row, i.e. to the old
  // value of A2_pos[i + 1]; shift the array back by one slot to restore the
  // row pointers.
  for (int32_t p = 0; p < A1_dimension; p++) {
    A2_pos[A1_dimension - p] = A2_pos[((A1_dimension - p) - 1)];
  }
  A2_pos[0] = 0;

  free(A2_nnz);

  // Hand the freshly allocated index and value arrays back to the output tensor.
  A->indices[1][0] = (uint8_t*)(A2_pos);
  A->indices[1][1] = (uint8_t*)(A2_crd);
  A->vals = (uint8_t*)A_vals;
  return 0;
}
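
// --- Illustrative driver (added; not part of the taco-generated kernel). A
// minimal, hypothetical sketch of how assemble() might be exercised: it
// hand-builds two small 2x3 CSR matrices B and C using only the fields the
// kernel reads, calls assemble() to allocate and fill A's row pointer and
// column index arrays, and prints the assembled sparsity pattern. The
// separately generated compute kernel (which fills A->vals) is assumed to
// live in a companion file such as spadd_compute.c and is not shown here.
// Guarded by a macro so this file can still be compiled as a kernel-only
// translation unit.
#ifdef SPADD_ASSEMBLY_DEMO
#include <stdio.h>

int main(void) {
  // B = [1 0 2; 0 3 0] and C = [0 4 0; 5 0 6], both 2x3 in CSR form.
  int32_t dims[2] = {2, 3};

  int32_t  B_pos[3]  = {0, 2, 3};
  int32_t  B_crd[3]  = {0, 2, 1};
  double   B_vals[3] = {1.0, 2.0, 3.0};
  uint8_t*  B_idx1[2]    = {(uint8_t*)B_pos, (uint8_t*)B_crd};
  uint8_t** B_indices[2] = {NULL, B_idx1};
  taco_tensor_t B = { .order = 2, .dimensions = dims,
                      .indices = B_indices, .vals = (uint8_t*)B_vals };

  int32_t  C_pos[3]  = {0, 1, 3};
  int32_t  C_crd[3]  = {1, 0, 2};
  double   C_vals[3] = {4.0, 5.0, 6.0};
  uint8_t*  C_idx1[2]    = {(uint8_t*)C_pos, (uint8_t*)C_crd};
  uint8_t** C_indices[2] = {NULL, C_idx1};
  taco_tensor_t C = { .order = 2, .dimensions = dims,
                      .indices = C_indices, .vals = (uint8_t*)C_vals };

  // A's index and value arrays start empty; assemble() allocates them.
  uint8_t*  A_idx1[2]    = {NULL, NULL};
  uint8_t** A_indices[2] = {NULL, A_idx1};
  taco_tensor_t A = { .order = 2, .dimensions = dims,
                      .indices = A_indices, .vals = NULL };

  assemble(&A, &B, &C);

  // Print the assembled CSR pattern of A.
  int32_t* A_pos = (int32_t*)A.indices[1][0];
  int32_t* A_crd = (int32_t*)A.indices[1][1];
  for (int32_t i = 0; i < dims[0]; i++) {
    printf("row %d:", (int)i);
    for (int32_t p = A_pos[i]; p < A_pos[i + 1]; p++) {
      printf(" %d", (int)A_crd[p]);
    }
    printf("\n");
  }
  // Expected: both rows of A contain columns 0 1 2, the union of B's and C's
  // row patterns.

  free(A_pos);
  free(A_crd);
  free(A.vals);
  return 0;
}
#endif  // SPADD_ASSEMBLY_DEMO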