Background
Intel 公司在 X86 指令集上提供了 AVX (Advanced Vector Extensions)、SSE (Streaming SIMD Extensions) 的特殊指令,這些在 SIMD 指令上提供強力的加速。
Problem
現在給你長度為 $N$ 的 32-bit floating point,分別將其開根號回傳。
- 你可能會需要函數庫
#include <x86intrin.h>
- 你可能會需要 AVX/SSE 開根指令
__m256 _mm256_sqrt_ps(__m256 a);
main.c
12345678910111213141516171819202122232425262728 #include <stdio.h>
#include <memory.h>
#include <math.h>
#include <time.h>
#include "VSQRT.h"
static
float
a[1048576], b[1048576];
int
main() {
int
N = 1048576;
for
(
int
i = 0; i < N; ++i)
a[i] = i;
for
(
int
it = 0; it < 20; it++)
{
memcpy
(b, a,
sizeof
(a[0])*N);
clock_t
t =
clock
();
for
(
int
i = 0; i < 10; i++)
sqrt2(b, b+N);
t =
clock
() - t;
fprintf
(stderr,
"It took me %f seconds.\n"
, ((
float
) t)/CLOCKS_PER_SEC);
float
sum = 0;
for
(
int
i = 0; i < N; i++)
sum += b[i];
printf
(
"%f\n"
, sum);
}
return
0;
}
VSQRT.h
123456 #ifndef __VSQRT_H
#define __VSQRT_H
void
sqrt2(
float
*begin,
float
*end);
#endif
VSQRT.c
請嘗試加速以下程式碼。
12345678 #include "VSQRT.h"
#include <math.h>
void
sqrt2(
float
*begin,
float
*end) {
for
(; begin != end; begin++)
*begin =
sqrt
(*begin);
}
Sample Input (stdin)
no input
Sample Output (stdout)
1234567891011121314151617181920 1051776.125000
1051776.125000
1051776.125000
1051776.125000
1051776.125000
1051776.125000
1051776.125000
1051776.125000
1051776.125000
1051776.125000
1051776.125000
1051776.125000
1051776.125000
1051776.125000
1051776.125000
1051776.125000
1051776.125000
1051776.125000
1051776.125000
1051776.125000
Makefile
all: main
main: main.c VSQRT.o
gcc -Wall -std=c99 -O3 main.c VSQRT.o -o main -lm
VSQRT.o: VSQRT.c
gcc -Wall -std=c99 -O3 -msse -mavx VSQRT.c -c -o VSQRT.o -lm
test:
time
-p ./main
Note
第 0 組測資使用範例輸入,其餘測資採用另外的主程式運行。