Search

2012年10月24日 星期三

ええええ?!

    繼上次改寫舊的繪圖引擎、把矩陣運算等等改用SIMD方式去寫之後,我很好奇MSDN上寫的_mm_set_ps跟_mm_load_ps除了元素順序相反以外還有什麼差別。我原本以為_mm_set_ps會直接從data段設定__m128之類型態的變數數值,不過看了一下編譯後的樣子,它會先用MOVSS設定xmm0~xmm3的低32位元,然後用UNPCKLPS逐步合併到xmm0,最後才把xmm0的值移動到__m128之類型態的變數。而且沒開/O2(也就是/Od)跟有開/O2時,設定xmm0~xmm3的順序會相反,不過除了相同數值共用同一個data段的常數以外,我看不出這樣優化了什麼。而用_mm_load_ps就直接用MOVAPS把整個16 bytes的資料載入xmm0,再用MOVAPS存到__m128之類型態的變數。總之我覺得SSE2效果不錯也好用。


__declspec(align(16)) float data[4] = {1.13f,1.22f,1.31f,1.0f};    //Data Segment
__m128 m0 = _mm_set_ps(1.0f,1.22f,1.33f,1.0f);
__m128 m1 = _mm_load_ps(data);


==============================================================
Release /Od

__m128 m0 = _mm_set_ps(1.0f,1.22f,1.33f,1.0f);

MOVSS XMM0,DWORD PTR DS:[402178]         ; FLOAT 1.000000
MOVSS XMM1,DWORD PTR DS:[402174]         ; FLOAT 1.330000
MOVSS XMM2,DWORD PTR DS:[402170]         ; FLOAT 1.220000
MOVSS XMM3,DWORD PTR DS:[402178]         ; FLOAT 1.000000
UNPCKLPS XMM0,XMM2
UNPCKLPS XMM1,XMM3
UNPCKLPS XMM0,XMM1
MOVAPS DQWORD PTR SS:[LOCAL.6],XMM0
MOVAPS XMM0,DQWORD PTR SS:[LOCAL.6]
MOVAPS DQWORD PTR SS:[LOCAL.10],XMM0

__m128 m1 = _mm_load_ps(data);

MOVAPS XMM0,DQWORD PTR DS:[403030]
MOVAPS DQWORD PTR SS:[LOCAL.14],XMM0
MOVAPS XMM0,DQWORD PTR SS:[LOCAL.14]
MOVAPS DQWORD PTR SS:[LOCAL.18],XMM0

==============================================================

Release /O2

__m128 m0 = _mm_set_ps(1.0f,1.22f,1.33f,1.0f);

MOVSS XMM0,DWORD PTR DS:[402140]         ; FLOAT 1.000000
MOVSS XMM2,DWORD PTR DS:[40213C]         ; FLOAT 1.220000
MOVSS XMM3,DWORD PTR DS:[402138]         ; FLOAT 1.330000
MOVSS XMM1,XMM0
UNPCKLPS XMM0,XMM2
UNPCKLPS XMM3,XMM1
UNPCKLPS XMM0,XMM3
MOVAPS DQWORD PTR SS:[LOCAL.8],XMM0

__m128 m1 = _mm_load_ps(data);

MOVAPS XMM0,DQWORD PTR DS:[403030]
MOVAPS DQWORD PTR SS:[LOCAL.4],XMM0

沒有留言:

張貼留言