Fast Sin and Cos on Delphi Embedded ASM

Hello!

There was a need to write a quick calculation of Sin and Cos. The basis for the calculations took the decomposition of the Taylor series. I use it in 3D systems (OpenGL and a graphical library of my own development). Unfortunately, it’s not possible to reduce the series “perfectly” for Double, but this is offset by good acceleration. The code is written in a built-in Delphi XE6 assembler. Used by SSE2.

Not suitable for scientific computing, but quite suitable for use in games.
There is enough accuracy to cover the digit capacity of the Single number, which is used
to multiply by the matrix.

Eventually:

  1. The achieved accuracy of the result is: 10.e-13
  2. The maximum discrepancy with the CPU is 0.000000000000045.
  3. The speed is increased in comparison with the CPU by 4.75 times.
  4. Speed ​​increased in comparison with Math.Sin and Math.Cos by 2.6 times.

For the test I used an Intel Core-i7 6950X Extreme 3.0 GHz processor.
Delphi source code is embedded in assembler comments.

Source:

Spoiler header
var
  gSinTab: arrayof Double;
  gSinAddr: UInt64;
const
  AbsMask64: UInt64 = ($7FFFFFFFFFFFFFFF);
  PI90: Double = (PI / 2.0);
  FSinTab: array[0..7] of Double =
  (1.0 / 6.0, //3!1.0 / 120.0, //5!1.0 / 5040.0, //7!1.0 / 362880.0, //9!1.0 / 39916800.0, //11!1.0 / 6227020800.0, //13!1.0 / 1307674368000.0, //15!1.0 / 355687428096000.0); //17!//Максимумы функции Sin для положительных значений
  QSinMaxP: array[0..3] of Double = (0.0, 1.0, 0.0, -1.0); 
  //Максимумы функции Sin для отрицательных значений
  QSinMaxM: array[0..3] of Double = (0.0, -1.0, 0.0, 1.0); 
  //Знаки квадрантов для положительных значений  
  QSinSignP: array[0..3] of Double = (1.0, 1.0, -1.0, -1.0); 
  //Знаки квадрантов для отрицательных значений
  QSinSignM: array[0..3] of Double = (-1.0, -1.0, 1.0, 1.0); 
functionSinf(const A: Double): Double;
asm
  .NOFRAME
  // На входе A в xmm0// Четверть окружности// X := IntFMod(Abs(A), PI90, D);// Result := FNum - (Trunc(FNum / FDen) * FDen);
  pxor xmm3, xmm3
  comisd A, xmm3
  jnz @ANZ
  ret // Result := 0.0;
 @ANZ:
  //Флаг отрицательного угла//Minus := (A < 0.0);
  setc cl
  movsd xmm1, [AbsMask64] //Abs(A)
  pand A, xmm1
  movsd xmm1, [PI90] //PI90 = FDen
  movsd xmm2, A //Копия A = FNum
  divsd xmm2, xmm1 //(FNum / FDen)
  roundsd xmm2, xmm2, 11b //11b - Trunc
  cvtsd2si rax, xmm2 //Целая часть (X в EAX)
  mulsd xmm2, xmm1
  subsd xmm0, xmm2
  //-----------------------------------//Нормализация квадрантаand rax, 3// D := (D and 3);//-----------------------------------// Проверка на максимум функции// if (X = 0.0) then// begin//   if Minus then//     Exit(QSinMaxM[D]) else//     Exit(QSinMaxP[D]);// end;
  comisd xmm0, xmm3
  jnz @XNZ
  shl rax, 3//умножить номер квадранта на размер Double (8 байт)
  cmp cl, 1//Если угол отрицательынй, то переход
  je @MaxMinus
 @MaxPlus:
  lea rdx, qword ptr [QSinMaxP]
  movsd xmm0, qword ptr [rdx + rax]
  ret
 @MaxMinus:
  lea rdx, qword ptr [QSinMaxM]
  movsd xmm0, qword ptr [rdx + rax]
  ret
  //-----------------------------------
 @XNZ:
  //При нечетном квадранте нужно вычесть градусы для симметрии// if (D and 1) <> 0 then X := (PI90 - X);
  mov edx, eax
  and edx, 1
  cmp edx, 0
  je @DZ
  subsd xmm1, xmm0
  movsd xmm0, xmm1
  //-----------------------------------
 @DZ:
  // Result остается в xmm0// X в xmm0// N := (X * X) в xmm2// F := (N * X) в xmm1// N
  movsd xmm2, xmm0 // Копия X
  mulsd xmm2, xmm2 // N := (X * X)// F
  movsd xmm1, xmm2 // Копия N
  mulsd xmm1, xmm0 // F := (N * X)//Загружаем таблицу факториалов для синуса
  mov rdx, [gSinTab]
  movaps xmm4, dqword ptr [rdx]
  movaps xmm6, dqword ptr [rdx + 16]
  movaps xmm8, dqword ptr [rdx + 32]
  movaps xmm10, dqword ptr [rdx + 48]
  //Извлекаем нечетную часть таблицы
  movhlps xmm5, xmm4
  movhlps xmm7, xmm6
  movhlps xmm9, xmm8
  movhlps xmm11, xmm10
  // Result := X - F * PDouble(gSinAddr)^;
  mulsd xmm4, xmm1 //FSinTab[0]
  subsd xmm0, xmm4
  // F := (F * N);// Result := Result + F * PDouble(gSinAddr + 8)^;
  mulsd xmm1, xmm2
  mulsd xmm5, xmm1 //FSinTab[1]
  addsd xmm0, xmm5
  // F := (F * N);// Result := Result - F * PDouble(gSinAddr + 16)^;
  mulsd xmm1, xmm2
  mulsd xmm6, xmm1 //FSinTab[2]
  subsd xmm0, xmm6
  // F := (F * N);// Result := Result + F * PDouble(gSinAddr + 24)^;
  mulsd xmm1, xmm2
  mulsd xmm7, xmm1 //FSinTab[3]
  addsd xmm0, xmm7
  // F := (F * N);// Result := Result - F * PDouble(gSinAddr + 32)^;
  mulsd xmm1, xmm2
  mulsd xmm8, xmm1 //FSinTab[4]
  subsd xmm0, xmm8
  // F := (F * N);// Result := Result + F * PDouble(gSinAddr + 40)^;
  mulsd xmm1, xmm2
  mulsd xmm9, xmm1 //FSinTab[5]
  addsd xmm0, xmm9
  // F := (F * N);// Result := Result - F * PDouble(gSinAddr + 48)^;
  mulsd xmm1, xmm2
  mulsd xmm10, xmm1 //FSinTab[6]
  subsd xmm0, xmm10
  // F := (F * N);// Result := Result + F * PDouble(gSinAddr + 56)^;
  mulsd xmm1, xmm2
  mulsd xmm11, xmm1 //FSinTab[7]
  addsd xmm0, xmm11
  //-----------------------------------// if Minus then//   Result := (Result * QSinSignM[D]) else//   Result := (Result * QSinSignP[D]);shl rax, 3//Умножаем на 8
  cmp cl, 1
  je @Minus
  lea rdx, qword ptr [QSinSignP]
  mulsd xmm0, qword ptr [rdx + rax]
  ret
 @Minus:
  lea rdx, qword ptr [QSinSignM]
  mulsd xmm0, qword ptr [rdx + rax]
end;
//------------------------------------functionCosf(const A: Double): Double;
asm
  .NOFRAME
  // A в xmm0// Четверть окружности// X := IntFMod(AMod, PI90, D);// Result := FNum - (Trunc(FNum / FDen) * FDen);
  pxor xmm3, xmm3
  comisd A, xmm3
  jnz @ANZ
  movsd xmm0, [DoubleOne]
  ret // Result := 1.0;//-----------------------------------
 @ANZ:
  //Флаг отрицательного угла//Minus := (A < 0.0);
  setc cl
  movsd xmm1, [AbsMask64] //Abs(A)
  pand A, xmm1
  //-----------------------------------
  movsd xmm1, [PI90] //PI90 = FDen//-----------------------------------// if Minus then//   AMod := Abs(A) - PI90 else//   AMod := Abs(A) + PI90;
  cmp cl, 1
  je @SubPI90
  addsd A, xmm1
  jmp @IntFMod
 @SubPI90:
  subsd A, xmm1
  //-----------------------------------
 @IntFMod:
  movsd xmm2, A //Копия A = FNum
  divsd xmm2, xmm1 //(FNum / FDen)
  roundsd xmm2, xmm2, 11b //11b - Trunc
  cvtsd2si rax, xmm2 //Целая часть (X в EAX)
  mulsd xmm2, xmm1
  subsd xmm0, xmm2
  //-----------------------------------//Нормализация квадрантаand rax, 3// D := (D and 3);//-----------------------------------// Проверка на максимум функции// if (X = 0.0) then// begin//   if Minus then//     Exit(QSinMaxM[D]) else//     Exit(QSinMaxP[D]);// end;
  comisd xmm0, xmm3
  jnz @XNZ
  shl rax, 3//умножить номер квадранта на размер Double (8 байт)
  cmp cl, 1//Если угол отрицательынй, то переход
  je @MaxMinus
 @MaxPlus:
  lea rdx, qword ptr [QSinMaxP]
  movsd xmm0, qword ptr [rdx + rax]
  ret
 @MaxMinus:
  lea rdx, qword ptr [QSinMaxM]
  movsd xmm0, qword ptr [rdx + rax]
  ret
  //-----------------------------------
 @XNZ:
  //При нечетном квадранте нужно вычесть градусы для симметрии// if (D and 1) <> 0 then X := (PI90 - X);
  mov edx, eax
  and edx, 1
  cmp edx, 0
  je @DZ
  subsd xmm1, xmm0
  movsd xmm0, xmm1
 @DZ:
  // Result остается в xmm0// X в xmm0// N := (X * X) в xmm2// F := (N * X) в xmm1// N
  movsd xmm2, xmm0 // Копия X
  mulsd xmm2, xmm2 // N := (X * X)// F
  movsd xmm1, xmm2 // Копия N
  mulsd xmm1, xmm0 // F := (N * X)//Загружаем таблицу факториалов для синуса
  mov rdx, [gSinTab]
  movaps xmm4, dqword ptr [rdx]
  movaps xmm6, dqword ptr [rdx + 16]
  movaps xmm8, dqword ptr [rdx + 32]
  movaps xmm10, dqword ptr [rdx + 48]
  //Извлекаем нечетную часть таблицы
  movhlps xmm5, xmm4
  movhlps xmm7, xmm6
  movhlps xmm9, xmm8
  movhlps xmm11, xmm10
  // Result := X - F * PDouble(gSinAddr)^;
  mulsd xmm4, xmm1 //FSinTab[0]
  subsd xmm0, xmm4
  // F := (F * N);// Result := Result + F * PDouble(gSinAddr + 8)^;
  mulsd xmm1, xmm2
  mulsd xmm5, xmm1 //FSinTab[1]
  addsd xmm0, xmm5
  // F := (F * N);// Result := Result - F * PDouble(gSinAddr + 16)^;
  mulsd xmm1, xmm2
  mulsd xmm6, xmm1 //FSinTab[2]
  subsd xmm0, xmm6
  // F := (F * N);// Result := Result + F * PDouble(gSinAddr + 24)^;
  mulsd xmm1, xmm2
  mulsd xmm7, xmm1 //FSinTab[3]
  addsd xmm0, xmm7
  // F := (F * N);// Result := Result - F * PDouble(gSinAddr + 32)^;
  mulsd xmm1, xmm2
  mulsd xmm8, xmm1 //FSinTab[4]
  subsd xmm0, xmm8
  // F := (F * N);// Result := Result + F * PDouble(gSinAddr + 40)^;
  mulsd xmm1, xmm2
  mulsd xmm9, xmm1 //FSinTab[5]
  addsd xmm0, xmm9
  // F := (F * N);// Result := Result - F * PDouble(gSinAddr + 48)^;
  mulsd xmm1, xmm2
  mulsd xmm10, xmm1 //FSinTab[6]
  subsd xmm0, xmm10
  // F := (F * N);// Result := Result + F * PDouble(gSinAddr + 56)^;
  mulsd xmm1, xmm2
  mulsd xmm11, xmm1 //FSinTab[7]
  addsd xmm0, xmm11
  //-----------------------------------// if Minus then//   Result := (Result * QSinSignM[D]) else//   Result := (Result * QSinSignP[D]);shl rax, 3//Умножаем на 8
  cmp cl, 1
  je @Minus
  lea rdx, qword ptr [QSinSignP]
  mulsd xmm0, qword ptr [rdx + rax]
  ret
 @Minus:
  lea rdx, qword ptr [QSinSignM]
  mulsd xmm0, qword ptr [rdx + rax]
end;
Initialization//Выровненный массив
  SetLength(gSinTab, 8);
  //Адрес таблицы
  gSinAddr := UInt64(@FSinTab[0]);
  //Копируем таблицу в выровненный массив
  Move(FSinTab[0], gSinTab[0], SizeOf(Double) * 8);
Finalization
  SetLength(gSinTab, 0);


Example of work here

Constructive suggestions and comments are welcome.

Also popular now: