Fast Sin and Cos on Delphi Embedded ASM
Hello!
There was a need to write a quick calculation of Sin and Cos. The basis for the calculations took the decomposition of the Taylor series. I use it in 3D systems (OpenGL and a graphical library of my own development). Unfortunately, it’s not possible to reduce the series “perfectly” for Double, but this is offset by good acceleration. The code is written in a built-in Delphi XE6 assembler. Used by SSE2.
Not suitable for scientific computing, but quite suitable for use in games.
There is enough accuracy to cover the digit capacity of the Single number, which is used
to multiply by the matrix.
Eventually:
For the test I used an Intel Core-i7 6950X Extreme 3.0 GHz processor.
Delphi source code is embedded in assembler comments.
Source:
Example of work here
Constructive suggestions and comments are welcome.
There was a need to write a quick calculation of Sin and Cos. The basis for the calculations took the decomposition of the Taylor series. I use it in 3D systems (OpenGL and a graphical library of my own development). Unfortunately, it’s not possible to reduce the series “perfectly” for Double, but this is offset by good acceleration. The code is written in a built-in Delphi XE6 assembler. Used by SSE2.
Not suitable for scientific computing, but quite suitable for use in games.
There is enough accuracy to cover the digit capacity of the Single number, which is used
to multiply by the matrix.
Eventually:
- The achieved accuracy of the result is: 10.e-13
- The maximum discrepancy with the CPU is 0.000000000000045.
- The speed is increased in comparison with the CPU by 4.75 times.
- Speed increased in comparison with Math.Sin and Math.Cos by 2.6 times.
For the test I used an Intel Core-i7 6950X Extreme 3.0 GHz processor.
Delphi source code is embedded in assembler comments.
Source:
Spoiler header
var
gSinTab: arrayof Double;
gSinAddr: UInt64;
const
AbsMask64: UInt64 = ($7FFFFFFFFFFFFFFF);
PI90: Double = (PI / 2.0);
FSinTab: array[0..7] of Double =
(1.0 / 6.0, //3!1.0 / 120.0, //5!1.0 / 5040.0, //7!1.0 / 362880.0, //9!1.0 / 39916800.0, //11!1.0 / 6227020800.0, //13!1.0 / 1307674368000.0, //15!1.0 / 355687428096000.0); //17!//Максимумы функции Sin для положительных значений
QSinMaxP: array[0..3] of Double = (0.0, 1.0, 0.0, -1.0);
//Максимумы функции Sin для отрицательных значений
QSinMaxM: array[0..3] of Double = (0.0, -1.0, 0.0, 1.0);
//Знаки квадрантов для положительных значений
QSinSignP: array[0..3] of Double = (1.0, 1.0, -1.0, -1.0);
//Знаки квадрантов для отрицательных значений
QSinSignM: array[0..3] of Double = (-1.0, -1.0, 1.0, 1.0);
functionSinf(const A: Double): Double;
asm
.NOFRAME
// На входе A в xmm0// Четверть окружности// X := IntFMod(Abs(A), PI90, D);// Result := FNum - (Trunc(FNum / FDen) * FDen);
pxor xmm3, xmm3
comisd A, xmm3
jnz @ANZ
ret // Result := 0.0;
@ANZ:
//Флаг отрицательного угла//Minus := (A < 0.0);
setc cl
movsd xmm1, [AbsMask64] //Abs(A)
pand A, xmm1
movsd xmm1, [PI90] //PI90 = FDen
movsd xmm2, A //Копия A = FNum
divsd xmm2, xmm1 //(FNum / FDen)
roundsd xmm2, xmm2, 11b //11b - Trunc
cvtsd2si rax, xmm2 //Целая часть (X в EAX)
mulsd xmm2, xmm1
subsd xmm0, xmm2
//-----------------------------------//Нормализация квадрантаand rax, 3// D := (D and 3);//-----------------------------------// Проверка на максимум функции// if (X = 0.0) then// begin// if Minus then// Exit(QSinMaxM[D]) else// Exit(QSinMaxP[D]);// end;
comisd xmm0, xmm3
jnz @XNZ
shl rax, 3//умножить номер квадранта на размер Double (8 байт)
cmp cl, 1//Если угол отрицательынй, то переход
je @MaxMinus
@MaxPlus:
lea rdx, qword ptr [QSinMaxP]
movsd xmm0, qword ptr [rdx + rax]
ret
@MaxMinus:
lea rdx, qword ptr [QSinMaxM]
movsd xmm0, qword ptr [rdx + rax]
ret
//-----------------------------------
@XNZ:
//При нечетном квадранте нужно вычесть градусы для симметрии// if (D and 1) <> 0 then X := (PI90 - X);
mov edx, eax
and edx, 1
cmp edx, 0
je @DZ
subsd xmm1, xmm0
movsd xmm0, xmm1
//-----------------------------------
@DZ:
// Result остается в xmm0// X в xmm0// N := (X * X) в xmm2// F := (N * X) в xmm1// N
movsd xmm2, xmm0 // Копия X
mulsd xmm2, xmm2 // N := (X * X)// F
movsd xmm1, xmm2 // Копия N
mulsd xmm1, xmm0 // F := (N * X)//Загружаем таблицу факториалов для синуса
mov rdx, [gSinTab]
movaps xmm4, dqword ptr [rdx]
movaps xmm6, dqword ptr [rdx + 16]
movaps xmm8, dqword ptr [rdx + 32]
movaps xmm10, dqword ptr [rdx + 48]
//Извлекаем нечетную часть таблицы
movhlps xmm5, xmm4
movhlps xmm7, xmm6
movhlps xmm9, xmm8
movhlps xmm11, xmm10
// Result := X - F * PDouble(gSinAddr)^;
mulsd xmm4, xmm1 //FSinTab[0]
subsd xmm0, xmm4
// F := (F * N);// Result := Result + F * PDouble(gSinAddr + 8)^;
mulsd xmm1, xmm2
mulsd xmm5, xmm1 //FSinTab[1]
addsd xmm0, xmm5
// F := (F * N);// Result := Result - F * PDouble(gSinAddr + 16)^;
mulsd xmm1, xmm2
mulsd xmm6, xmm1 //FSinTab[2]
subsd xmm0, xmm6
// F := (F * N);// Result := Result + F * PDouble(gSinAddr + 24)^;
mulsd xmm1, xmm2
mulsd xmm7, xmm1 //FSinTab[3]
addsd xmm0, xmm7
// F := (F * N);// Result := Result - F * PDouble(gSinAddr + 32)^;
mulsd xmm1, xmm2
mulsd xmm8, xmm1 //FSinTab[4]
subsd xmm0, xmm8
// F := (F * N);// Result := Result + F * PDouble(gSinAddr + 40)^;
mulsd xmm1, xmm2
mulsd xmm9, xmm1 //FSinTab[5]
addsd xmm0, xmm9
// F := (F * N);// Result := Result - F * PDouble(gSinAddr + 48)^;
mulsd xmm1, xmm2
mulsd xmm10, xmm1 //FSinTab[6]
subsd xmm0, xmm10
// F := (F * N);// Result := Result + F * PDouble(gSinAddr + 56)^;
mulsd xmm1, xmm2
mulsd xmm11, xmm1 //FSinTab[7]
addsd xmm0, xmm11
//-----------------------------------// if Minus then// Result := (Result * QSinSignM[D]) else// Result := (Result * QSinSignP[D]);shl rax, 3//Умножаем на 8
cmp cl, 1
je @Minus
lea rdx, qword ptr [QSinSignP]
mulsd xmm0, qword ptr [rdx + rax]
ret
@Minus:
lea rdx, qword ptr [QSinSignM]
mulsd xmm0, qword ptr [rdx + rax]
end;
//------------------------------------functionCosf(const A: Double): Double;
asm
.NOFRAME
// A в xmm0// Четверть окружности// X := IntFMod(AMod, PI90, D);// Result := FNum - (Trunc(FNum / FDen) * FDen);
pxor xmm3, xmm3
comisd A, xmm3
jnz @ANZ
movsd xmm0, [DoubleOne]
ret // Result := 1.0;//-----------------------------------
@ANZ:
//Флаг отрицательного угла//Minus := (A < 0.0);
setc cl
movsd xmm1, [AbsMask64] //Abs(A)
pand A, xmm1
//-----------------------------------
movsd xmm1, [PI90] //PI90 = FDen//-----------------------------------// if Minus then// AMod := Abs(A) - PI90 else// AMod := Abs(A) + PI90;
cmp cl, 1
je @SubPI90
addsd A, xmm1
jmp @IntFMod
@SubPI90:
subsd A, xmm1
//-----------------------------------
@IntFMod:
movsd xmm2, A //Копия A = FNum
divsd xmm2, xmm1 //(FNum / FDen)
roundsd xmm2, xmm2, 11b //11b - Trunc
cvtsd2si rax, xmm2 //Целая часть (X в EAX)
mulsd xmm2, xmm1
subsd xmm0, xmm2
//-----------------------------------//Нормализация квадрантаand rax, 3// D := (D and 3);//-----------------------------------// Проверка на максимум функции// if (X = 0.0) then// begin// if Minus then// Exit(QSinMaxM[D]) else// Exit(QSinMaxP[D]);// end;
comisd xmm0, xmm3
jnz @XNZ
shl rax, 3//умножить номер квадранта на размер Double (8 байт)
cmp cl, 1//Если угол отрицательынй, то переход
je @MaxMinus
@MaxPlus:
lea rdx, qword ptr [QSinMaxP]
movsd xmm0, qword ptr [rdx + rax]
ret
@MaxMinus:
lea rdx, qword ptr [QSinMaxM]
movsd xmm0, qword ptr [rdx + rax]
ret
//-----------------------------------
@XNZ:
//При нечетном квадранте нужно вычесть градусы для симметрии// if (D and 1) <> 0 then X := (PI90 - X);
mov edx, eax
and edx, 1
cmp edx, 0
je @DZ
subsd xmm1, xmm0
movsd xmm0, xmm1
@DZ:
// Result остается в xmm0// X в xmm0// N := (X * X) в xmm2// F := (N * X) в xmm1// N
movsd xmm2, xmm0 // Копия X
mulsd xmm2, xmm2 // N := (X * X)// F
movsd xmm1, xmm2 // Копия N
mulsd xmm1, xmm0 // F := (N * X)//Загружаем таблицу факториалов для синуса
mov rdx, [gSinTab]
movaps xmm4, dqword ptr [rdx]
movaps xmm6, dqword ptr [rdx + 16]
movaps xmm8, dqword ptr [rdx + 32]
movaps xmm10, dqword ptr [rdx + 48]
//Извлекаем нечетную часть таблицы
movhlps xmm5, xmm4
movhlps xmm7, xmm6
movhlps xmm9, xmm8
movhlps xmm11, xmm10
// Result := X - F * PDouble(gSinAddr)^;
mulsd xmm4, xmm1 //FSinTab[0]
subsd xmm0, xmm4
// F := (F * N);// Result := Result + F * PDouble(gSinAddr + 8)^;
mulsd xmm1, xmm2
mulsd xmm5, xmm1 //FSinTab[1]
addsd xmm0, xmm5
// F := (F * N);// Result := Result - F * PDouble(gSinAddr + 16)^;
mulsd xmm1, xmm2
mulsd xmm6, xmm1 //FSinTab[2]
subsd xmm0, xmm6
// F := (F * N);// Result := Result + F * PDouble(gSinAddr + 24)^;
mulsd xmm1, xmm2
mulsd xmm7, xmm1 //FSinTab[3]
addsd xmm0, xmm7
// F := (F * N);// Result := Result - F * PDouble(gSinAddr + 32)^;
mulsd xmm1, xmm2
mulsd xmm8, xmm1 //FSinTab[4]
subsd xmm0, xmm8
// F := (F * N);// Result := Result + F * PDouble(gSinAddr + 40)^;
mulsd xmm1, xmm2
mulsd xmm9, xmm1 //FSinTab[5]
addsd xmm0, xmm9
// F := (F * N);// Result := Result - F * PDouble(gSinAddr + 48)^;
mulsd xmm1, xmm2
mulsd xmm10, xmm1 //FSinTab[6]
subsd xmm0, xmm10
// F := (F * N);// Result := Result + F * PDouble(gSinAddr + 56)^;
mulsd xmm1, xmm2
mulsd xmm11, xmm1 //FSinTab[7]
addsd xmm0, xmm11
//-----------------------------------// if Minus then// Result := (Result * QSinSignM[D]) else// Result := (Result * QSinSignP[D]);shl rax, 3//Умножаем на 8
cmp cl, 1
je @Minus
lea rdx, qword ptr [QSinSignP]
mulsd xmm0, qword ptr [rdx + rax]
ret
@Minus:
lea rdx, qword ptr [QSinSignM]
mulsd xmm0, qword ptr [rdx + rax]
end;
Initialization//Выровненный массив
SetLength(gSinTab, 8);
//Адрес таблицы
gSinAddr := UInt64(@FSinTab[0]);
//Копируем таблицу в выровненный массив
Move(FSinTab[0], gSinTab[0], SizeOf(Double) * 8);
Finalization
SetLength(gSinTab, 0);
Example of work here
Constructive suggestions and comments are welcome.