tensor-compiler.github.io/examples/spgemm_compute.c at master · tensor-compiler/tensor-compiler.github.io · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
// Generated by the Tensor Algebra Compiler (tensor-compiler.org)
// taco "A(i,j)=B(i,k)*C(k,j)" -f=A:ds:0,1 -f=B:ds:0,1 -f=C:ds:0,1 -s="reorder(i,k,j)" -s="precompute(B(i,k)*C(k,j),j,j)" -s="assemble(A,Insert)" -s="parallelize(i,CPUThread,NoRaces)" -write-source=taco_kernel.c -write-compute=taco_compute.c -write-assembly=taco_assembly.c

/*
 * Fills in the values of A for A(i,j) = B(i,k) * C(k,j), one row of B at a
 * time, using a dense per-thread workspace of C2_dimension entries.
 *
 * For every row i, the products B(i,k) * C(k,j) are accumulated into
 * workspace[j]; the touched column indices are collected, sorted with `cmp`
 * (defined elsewhere in this file; presumably an ascending int32_t
 * comparator -- confirm), and the accumulated values are appended to A_vals
 * at the position stored in A2_pos[i], which is advanced after every write.
 * NOTE(review): this presumes A2_pos[i] holds the start offset of row i on
 * entry (e.g. written by a preceding assembly kernel) -- confirm with caller.
 *
 * After all rows are processed, A2_pos is shifted right by one slot and
 * A2_pos[0] is set to 0.
 *
 * Returns 0 on success. Returns -1 if C's second dimension is negative, if
 * the scratch buffer size would overflow size_t, or if a scratch allocation
 * fails; in those cases nothing has been written to A.
 */
int compute(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *C) {
  int A1_dimension = (int)(A->dimensions[0]);
  int* restrict A2_pos = (int*)(A->indices[1][0]);
  double* restrict A_vals = (double*)(A->vals);
  int B1_dimension = (int)(B->dimensions[0]);
  int* restrict B2_pos = (int*)(B->indices[1][0]);
  int* restrict B2_crd = (int*)(B->indices[1][1]);
  double* restrict B_vals = (double*)(B->vals);
  int C2_dimension = (int)(C->dimensions[1]);
  int* restrict C2_pos = (int*)(C->indices[1][0]);
  int* restrict C2_crd = (int*)(C->indices[1][1]);
  double* restrict C_vals = (double*)(C->vals);

  // A negative extent cannot be turned into a buffer size.
  if (C2_dimension < 0) {
    return -1;
  }

  // Size arithmetic is done in size_t: the int product
  // C2_dimension * thread_count can overflow for wide matrices.
  const size_t row_len = (size_t)C2_dimension;
  const size_t num_threads = (size_t)omp_get_max_threads();

  // sizeof(double) is the largest element size allocated below, so this
  // single guard covers all three buffers.
  if (row_len != 0 && num_threads > SIZE_MAX / sizeof(double) / row_len) {
    return -1;
  }

  // Never request zero bytes: malloc(0) may return NULL without having
  // failed, which would be indistinguishable from an out-of-memory error.
  size_t alloc_len = row_len * num_threads;
  if (alloc_len == 0) {
    alloc_len = 1;
  }

  double* restrict workspace_all = (double*)malloc(sizeof(double) * alloc_len);
  int32_t* restrict workspace_index_list_all = (int32_t*)malloc(sizeof(int32_t) * alloc_len);
  // The "already set" flags must start cleared, hence calloc.
  bool* restrict workspace_already_set_all = (bool*)calloc(alloc_len, sizeof(bool));

  if (!workspace_all || !workspace_index_list_all || !workspace_already_set_all) {
    // free() accepts null pointers, so no per-pointer guards are needed.
    free(workspace_index_list_all);
    free(workspace_already_set_all);
    free(workspace_all);
    return -1;
  }

  #pragma omp parallel for schedule(runtime)
  for (int32_t i = 0; i < B1_dimension; i++) {
    int32_t workspace_index_list_size = 0;

    // Each thread works in its own row_len-sized slice of the scratch buffers.
    const size_t thread_offset = row_len * (size_t)omp_get_thread_num();
    double* restrict workspace = workspace_all + thread_offset;
    int32_t* restrict workspace_index_list = workspace_index_list_all + thread_offset;
    bool* restrict workspace_already_set = workspace_already_set_all + thread_offset;

    // Accumulate row i of B times the matching rows of C into the workspace.
    for (int32_t kB = B2_pos[i]; kB < B2_pos[(i + 1)]; kB++) {
      int32_t k = B2_crd[kB];
      for (int32_t jC = C2_pos[k]; jC < C2_pos[(k + 1)]; jC++) {
        int32_t j = C2_crd[jC];
        if (!workspace_already_set[j]) {
          // First contribution to column j in this row: record the column.
          workspace[j] = B_vals[kB] * C_vals[jC];
          workspace_index_list[workspace_index_list_size] = j;
          workspace_already_set[j] = 1;
          workspace_index_list_size++;
        }
        else {
          workspace[j] = workspace[j] + B_vals[kB] * C_vals[jC];
        }
      }
    }

    // Order the touched columns before emitting them into A.
    qsort(workspace_index_list, workspace_index_list_size, sizeof(int32_t), cmp);

    for (int32_t workspace_index_locator = 0; workspace_index_locator < workspace_index_list_size; workspace_index_locator++) {
      int32_t j = workspace_index_list[workspace_index_locator];
      // A2_pos[i] doubles as the write cursor for row i.
      int32_t pA2 = A2_pos[i];
      A2_pos[i] = A2_pos[i] + 1;
      A_vals[pA2] = workspace[j];
      // Clear the flag so the slice is clean for the next row on this thread.
      workspace_already_set[j] = 0;
    }
  }

  free(workspace_index_list_all);
  free(workspace_already_set_all);
  free(workspace_all);

  // Shift A2_pos right by one slot, walking from the back so that no entry
  // is overwritten before it has been copied, then reset the first entry.
  for (int32_t p = 0; p < A1_dimension; p++) {
    A2_pos[A1_dimension - p] = A2_pos[((A1_dimension - p) - 1)];
  }
  A2_pos[0] = 0;

  A->indices[1][0] = (uint8_t*)(A2_pos);
  A->vals = (uint8_t*)A_vals;
  return 0;
}