속도가 느린 sqrt를 다양한 알고리즘으로 구현하여 테스트해 보았습니다.
1) ASMsqrt : FPU를 이용한 어셈코드
__forceinline float ASMsqrt(float src) // FPU { float t; _asm { fld src; fsqrt; fstp t; } return t; } |
2) fSQRT : GPG 책에 나오는 알고르즘
#define SQRTTABLESIZE 256 /* Note: code below assumes this is 256. */ unsigned int ZSQRTTABLE[SQRTTABLESIZE] = { 531980127, 532026288, 532072271, 532118079, 532163712, 532209174, 532254465, 532299589, 532344546, 532389339, 532433970, 532478440, 532522750, 532566903, 532610900, 532654744, 532698434, 532741974, 532785365, 532828607, 532871704, 532914655, 532957463, 533000129, 533042654, 533085041, 533127289, 533169401, 533211378, 533253220, 533294931, 533336509, 533377958, 533419278, 533460470, 533501535, 533542475, 533583291, 533623984, 533664554, 533705004, 533745334, 533785545, 533825638, 533865615, 533905476, 533945222, 533984855, 534024374, 534063782, 534103079, 534142267, 534181345, 534220315, 534259178, 534297934, 534336585, 534375132, 534413574, 534451914, 534490152, 534528288, 534566324, 534604260, 534642098, 534679837, 534717478, 534755023, 534792473, 534829827, 534867086, 534904252, 534941325, 534978305, 535015194, 535051992, 535088699, 535125317, 535161846, 535198287, 535234640, 535270905, 535307085, 535343178, 535379187, 535415110, 535450950, 535486706, 535522379, 535557970, 535593480, 535628908, 535664255, 535699523, 535734711, 535769820, 535804850, 535839803, 535874678, 535909476, 535944198, 535978844, 536013414, 536047910, 536082331, 536116678, 536150952, 536185153, 536219281, 536253337, 536287322, 536321235, 536355078, 536388850, 536422553, 536456186, 536489750, 536523246, 536556673, 536590033, 536623325, 536656551, 536689709, 536722802, 536755829, 536788791, 536821688, 536854520, 536887280, 536919921, 536952436, 536984827, 537017094, 537049241, 537081267, 537113174, 537144963, 537176637, 537208195, 537239640, 537270972, 537302193, 537333304, 537364306, 537395200, 537425987, 537456669, 537487246, 537517720, 537548091, 537578361, 537608530, 537638600, 537668572, 537698446, 537728224, 537757906, 537787493, 537816986, 537846387, 537875696, 537904913, 537934040, 537963078, 537992027, 538020888, 538049662, 538078350, 538106952, 538135470, 538163903, 538192254, 538220521, 538248707, 538276812, 538304837, 538332781, 538360647, 538388434, 538416144, 538443776, 538471332, 538498812, 538526217, 538553548, 538580804, 538607987, 538635097, 538662136, 538689102, 538715997, 538742822, 538769577, 538796263, 538822880, 538849428, 538875909, 538902322, 538928668, 538954949, 538981163, 539007312, 539033396, 539059416, 539085373, 539111265, 539137095, 539162863, 539188568, 539214212, 539239794, 539265316, 539290778, 539316180, 539341522, 539366806, 539392031, 539417197, 539442306, 539467358, 539492352, 539517290, 539542171, 539566997, 539591768, 539616483, 539641143, 539665749, 539690301, 539714800, 539739245, 539763637, 539787976, 539812264, 539836499, 539860682, 539884815, 539908896, 539932927, 539956907, 539980838, 540004718, 540028549, 540052332, 540076065, 540099750, 540123387, 540146976, 540170517, 540194011, 540217458, 540240858, 540264211, 540287519, 540310780, 540333996 }; __forceinline float fSQRT(float f) { INTORFLOAT fi; unsigned int e, n; /* Get raw bits of floating point f. */ fi.f = f; n = fi.i; /* Divide exponent by 2 -- this is done in-place, no need to shift all the way down to 0 the least significant bits and then back up again. Note that we are also dividing the exponent bias (127) by 2, this gets corrected when we add in the sqrttable entry. */ e = (n >> 1) & 0x3f800000; /* n is the table index -- we're using 1 bit from the original exponent (e0) plus 7 bits from the mantissa. */ n = (n >> 16) & 0xff; /* Add calculated exponent to mantissa + re-biasing constant from table. */ fi.i = ZSQRTTABLE[n] + e; /* Return floating point result. */ return fi.f; } |
3) FastSqrt : Dave Eberly의 FastInverseSqrt의 알고리즘
__forceinline float FastInverseSqrt(float fValue) { float fHalf = 0.5f * fValue; int i = *(int*)&fValue; i = 0x5f3759df - (i >> 1); fValue = *(float*)&i; fValue = fValue*(1.5f - fHalf*fValue*fValue); return fValue; } __forceinline float FastSqrt(float f) { return FastInverseSqrt(f) * f; } |
4) sqrt : c에 있는 일반 함수
5) irrSqrt : irrlicht에 나오는 알고리즘
typedef unsigned int u32; typedef float f32; #define IEEE_1_0 0x3f800000 __forceinline f32 reciprocal_squareroot(const f32 x) { u32 tmp = (u32(IEEE_1_0 << 1) + IEEE_1_0 - *(u32*)&x) >> 1; f32 y = *(f32*)&tmp; return 1.f / (y * (1.47f - 0.47f * x * y * y)); } |
6) sseSqrt : SSE를 이용한 어셈코드
__forceinline f32 SSE2sqrt(const f32 x) { // an sse2 version __asm { movss xmm0, x rsqrtss xmm0, xmm0 movss x, xmm0 } return 1.0f / x; } |
결과를 보시면 아시겠지만 속도는 fSQRT가 가장 빠릅니다. 그러나 정확도는 가장 떨어집니다. 즉, 정확도가 아주 중요하지 않는 곳에서는 fSQRT를 사용하시는 것이 좋습니다. 정확도가 중요시되는 곳에서는 ASMsqrt를 사용하세요.
빠른 속도 순서 : fSQRT > FastSqrt > ASMsqrt = sseSqrt = irrSqrt > sqrt
정확도 순서 : sqrt = ASMsqrt > sseSqrt > irrSqrt > FastSqrt > fSQRT
다만, 알고리즘적으로 sqrt를 구현해야 할 때(플랫폼이 PC가 아닌..)에는 FastSqrt나 irrSqrt 는 유용하겠네요. 그럼 이도 저도 아닌 sseSqrt 코드는 버려야...
주의
코드를 가지고 최적화테스트하실 때는 Release상태에서 최적화옵션은 다 끄되, 인라인은 켜 놓으셔야 올바른 결과가 도출됩니다.
'자료구조' 카테고리의 다른 글
검색 알고리즘 (0) | 2013.06.07 |
---|---|
알고리즘 관계복잡도 (0) | 2013.05.14 |
해시테이블 (0) | 2013.04.25 |
이진검색트리 (0) | 2013.04.25 |