void DSPF_dp_mat_mul_gemm(double *x1, double const a, const int r1, const int c1,
double *x2, const int c2, double *restrict y)
{
int i, j, k, xoff1, xoff2;
double sum0, sum1, sum2, sum3;
double x00, x01, y00, y01, y10, y11, x10, x11;
double *ptr_x, *ptr_y, *restrict y1, *restrict y2;
_nassert(r1 > 0);
_nassert(c1 > 0);
_nassert(c2 > 0);
_nassert((int)x1 % 8 == 0);
_nassert((int)x2 % 8 == 0);
_nassert((int)y % 8 == 0);
_nassert(c1 % 2 == 0 );
_nassert(r1 % 2 == 0 );
_nassert(c2 % 2 == 0 );
#pragma MUST_ITERATE(1,,)
for (j = 0; j < c2; j+=2) {
xoff2 = j * c1;
y1 = &y[(j + 0) * r1];
y2 = &y[(j + 1) * r1];
#pragma MUST_ITERATE(1,,)
for (i = 0; i < r1; i+=2) {
xoff1 = i * c1;
sum0 = 0;
sum1 = 0;
sum2 = 0;
sum3 = 0;
ptr_x = &x1[xoff1];
ptr_y = &x2[xoff2];
#pragma MUST_ITERATE(1,,)
for (k = 0; k < c1; k+=2,ptr_x+=2,ptr_y+=2) {
x00 = ptr_x[0];
x01 = ptr_x[1];
x10 = ptr_x[c1];
x11 = ptr_x[c1 + 1];
y00 = ptr_y[0];
y01 = ptr_y[c1];
y10 = ptr_y[1];
y11 = ptr_y[c1 + 1];
sum0 += x00 * y00 + x01 * y10;
sum1 += x00 * y01 + x01 * y11;
sum2 += x10 * y00 + x11 * y10;
sum3 += x10 * y01 + x11 * y11;
}
y1[(i + 0)] += a*sum0;
y2[(i + 0)] += a*sum1;
y1[(i + 1)] += a*sum2;
y2[(i + 1)] += a*sum3;
}
}
}void DSPF_dp_mat_mul_gemm_test(double *x1,double const a, const int r1,
const int c1, double *x2, const int c2,
double *restrict y) {
int i, j, k, xoff1, xoff2;
double sum0, sum1, sum2, sum3;
double x00, x01, y00, y01, y10, y11, x10, x11;
double *ptr_x, *ptr_y, *y1, *y2;
_nassert(r1 > 0);
_nassert(c1 > 0);
_nassert(c2 > 0);
_nassert((int)x1 % 8 == 0);
_nassert((int)x2 % 8 == 0);
_nassert((int)y % 8 == 0);
_nassert(c1 % 2 == 0 );
_nassert(r1 % 2 == 0 );
_nassert(c2 % 2 == 0 );
#pragma MUST_ITERATE(1,,)
for (j = 0; j < c2; j+=2) {
xoff2 = j;
#pragma MUST_ITERATE(1,,)
for (i = 0; i < r1; i+=2) {
xoff1 = i * c1;
y1 =&y[j+i*c2];
y2 =&y[j+c2+i*c2];
sum0 = 0;
sum1 = 0;
sum2 = 0;
sum3 = 0;
ptr_x = &x1[xoff1];
ptr_y = &x2[xoff2];
#pragma MUST_ITERATE(1,,)
for (k = 0; k < c1; k+=2,ptr_x+=2,ptr_y+=c2*2) {
x00 = ptr_x[0];
x01 = ptr_x[1];
x10 = ptr_x[c1];
x11 = ptr_x[c1 + 1];
y00 = ptr_y[0];
y01 = ptr_y[c2];
y10 = ptr_y[1];
y11 = ptr_y[c2 + 1];
sum0 += x00 * y00 + x01 * y01;
sum1 += x00 * y10 + x01 * y11;
sum2 += x10 * y00 + x11 * y01;
sum3 += x10 * y10 + x11 * y11;
}
y1[0] += a*sum0;
y2[0] += a*sum2;
y1[1] += a*sum1;
y2[1] += a*sum3;
}
}
}