Psst.. new poll here.
Psst.. new forums here.
Microsoft is blocking us again (TY IP Reputation!) so just use oauth login instead. :)
Paste
Pasted as C by gh ( 16 years ago )
#include <stdio.h>
#include <time.h>
#include <string.h>
#include <xmmintrin.h>
const int N = 0x100;
const long int M = 0xa;
const double BILLION = 1000000000.0;
double accum;
struct timespec start, stop;
void time_start() {
clock_gettime(CLOCK_REALTIME, &start;);
}
void time_stop() {
clock_gettime(CLOCK_REALTIME, &stop;);
}
double get_interval() {
return (stop.tv_sec - start.tv_sec) + (stop.tv_nsec - start.tv_nsec) / BILLION;
}
void print(float a[N + N*N]) {
int i;
int j;
printf("\n");
for (i = 0; i < N; i++) {
for (j = 0; j < N; j++) {
printf(".6f", a[i + N*j]);
}
printf("\n");
}
}
void transpose(float *a, float *b) {
int i;
int j;
for (j = 0; j < N; j++) {
for (i = 0; i < N; i++) {
a[i + N*j] = b[j + N*j];
}
}
}
void matrix_product(float *a, float *b, float *c) {
__m128 *d;
__m128 *e = (__m128 *)b;
__m128 *g = (__m128 *)c;
int i;
int j;
int k;
float *f = (float *)_mm_malloc(N * N * sizeof(float), 16);
d = (__m128 *)f;
transpose(a, f);
for (i = 0; i < N; i++) {
for (k = 0; k < N/4; k++) {
for (j = 0; j < N/4; j++) {
g[i + N*k] = _mm_mul_ps(d[i + N*j], e[i + N*j]);
}
}
}
_mm_free(f);
}
int main() {
float *a = (float *)_mm_malloc(N * N * sizeof(float), 16);
float *b = (float *)_mm_malloc(N * N * sizeof(float), 16);
float *r = (float *)_mm_malloc(N * N * sizeof(float), 16);
float *c1 = (float *)_mm_malloc(N * N * sizeof(float), 16);
float *c2 = (float *)_mm_malloc(N * N * sizeof(float), 16);
float s[N];
float row;
float col;
float *r1;
float *r2;
float *r3;
int i;
int j;
int k;
srand(time(NULL));
for (i = 0; i < N; i++) {
for (j = 0; j < N; j++) {
a[i + N*j] = rand() % 11 - 5;
}
}
row = 0;
for (j = 0; j < N; j++) {
row += a[0 + N*j];
}
memset(s, 0, sizeof(float) * N);
for (i = 0; i < N; i++) {
float sum = 0;
for (j = 0; j < N; j++) {
sum += a[i + N*j];
s[j] += a[i + N*j];
}
if (sum > row) {
row = sum;
}
}
col = s[0];
for (i = 1; i < N; i++) {
if (col < s[i]) {
col = s[i];
}
}
row = 1.0 / row / col;
for (i = 0; i < N; i++) {
for (j = 0; j < N; j++) {
b[j + N*i] = a[i + N*j] * row;
}
}
memset(r, 0, sizeof(float) * N * N);
for (i = 0; i < N; i++) {
for (j = 0; j < N; j++) {
for (k = 0; k < N; k++) {
r[i + N*k] -= b[i + N*j] * a[j + N*k];
}
}
r[i + N*i]++;
}
memset(c2, 0, sizeof(float) * N * N);
for (i = 0; i < N; i++) {
c2[i + N*i] = 1;
}
r1 = c1;
r2 = c2;
time_start();
for (i = 0; i < M; i++) {
memset(r1, 0, sizeof(float) * N * N);
matrix_product(r2, r, r1);
for (j = 0; j < N; j++) {
r1[j + N*j]++;
}
r3 = r1;
r1 = r2;
r2 = r3;
}
memset(r2, 0, sizeof(float) * N * N);
matrix_product(r1, b, r2);
time_stop();
accum += get_interval() / (M + 1);
printf("%lfs\n", accum);
_mm_free(a);
_mm_free(b);
_mm_free(r);
_mm_free(c1);
_mm_free(c2);
return 0;
}
Revise this Paste