Question
I am using a hobby program to teach myself high performance computing techniques.
My PC has an Intel Ivy Bridge Core i7 3770 processor with 32 GB of memory and the free version of the Microsoft Visual Studio 2010 C compiler.
The 64-bit program needs about 20 GB of memory because it has five 4 GB lookup tables (bytevecM ... bytevecX below). The inner loop of this search program was written as a separate C file (since I may want to replace it later with an assembler version), shown below:
#define H_PRIME 1000003
int inner(
  const char* bytevecM,
  const char* bytevecD,
  const char* bytevecC,
  const char* bytevecL,
  const char* bytevecX,
  int startval, int endval,
  int m2, int d2, int c2, int l2, int x2,
  int* ps
)
{
  int* psin = ps;
  int qqm;
  int m3, m4, m5, m6, m7;
  int d3, d4, d5, d6, d7;
  int c3, c4, c5, c6, c7;
  int l3, l4, l5, l6, l7;
  int x3, x4, x5, x6, x7;
  int q3, q4, q5, q6, q7, q8;
  for (q3 = startval; q3 < endval; ++q3)
  {
    if (q3 == 10 || q3 == 13) continue;
    m3 = (m2 ^ q3) * H_PRIME;
    d3 = (d2 ^ q3) * H_PRIME;
    c3 = (c2 ^ q3) * H_PRIME;
    l3 = (l2 ^ q3) * H_PRIME;
    x3 = (x2 ^ q3) * H_PRIME;
    for (q4 = 1; q4 < 128; ++q4)
    {
      if (q4 == 10 || q4 == 13) continue;
      m4 = (m3 ^ q4) * H_PRIME;
      d4 = (d3 ^ q4) * H_PRIME;
      c4 = (c3 ^ q4) * H_PRIME;
      l4 = (l3 ^ q4) * H_PRIME;
      x4 = (x3 ^ q4) * H_PRIME;
      for (q5 = 1; q5 < 128; ++q5)
      {
        if (q5 == 10 || q5 == 13) continue;
        m5 = (m4 ^ q5) * H_PRIME;
        d5 = (d4 ^ q5) * H_PRIME;
        c5 = (c4 ^ q5) * H_PRIME;
        l5 = (l4 ^ q5) * H_PRIME;
        x5 = (x4 ^ q5) * H_PRIME;
        for (q6 = 1; q6 < 128; ++q6)
        {
          if (q6 == 10 || q6 == 13) continue;
          m6 = (m5 ^ q6) * H_PRIME;
          d6 = (d5 ^ q6) * H_PRIME;
          c6 = (c5 ^ q6) * H_PRIME;
          l6 = (l5 ^ q6) * H_PRIME;
          x6 = (x5 ^ q6) * H_PRIME;
          for (q7 = 1; q7 < 128; ++q7)
          {
            if (q7 == 10 || q7 == 13) continue;
            m7 = (m6 ^ q7) * H_PRIME;
            d7 = (d6 ^ q7) * H_PRIME;
            c7 = (c6 ^ q7) * H_PRIME;
            l7 = (l6 ^ q7) * H_PRIME;
            x7 = (x6 ^ q7) * H_PRIME;
            for (q8 = 1; q8 < 128; ++q8)
            {
              if (q8 == 10 || q8 == 13) continue;
              qqm = bytevecM[(unsigned int)(m7 ^ q8)];
              if (qqm != 0
                && qqm == bytevecD[(unsigned int)(d7 ^ q8)]
                && qqm == bytevecC[(unsigned int)(c7 ^ q8)]
                && qqm == bytevecL[(unsigned int)(l7 ^ q8)]
                && qqm == bytevecX[(unsigned int)(x7 ^ q8)])
              {
                *ps++ = q3; *ps++ = q4; *ps++ = q5;
                *ps++ = q6; *ps++ = q7; *ps++ = q8;
                *ps++ = qqm;
              }
            }
          }
        }
      }
    }
  }
  return (int)(ps - psin);
}
Note, by the way, that the above algorithm is easily parallelizable by running one copy of it in each thread with different start and end ranges.
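For example, here is a minimal sketch of how that split could look with Win32 threads (the Slice struct and the run_slice/search_parallel helpers are just illustrative names; each thread gets its own q3 range and its own result buffer, so no locking is needed):
#include <windows.h>
#include <process.h>

/* prototype of the search kernel shown above */
int inner(const char*, const char*, const char*, const char*, const char*,
          int startval, int endval,
          int m2, int d2, int c2, int l2, int x2, int* ps);

typedef struct {
  const char *vM, *vD, *vC, *vL, *vX;  /* the five 4 GB lookup tables          */
  int start, end;                      /* this thread's q3 range [start, end)  */
  int m2, d2, c2, l2, x2;
  int *ps;                             /* private result buffer                */
  int nhits;                           /* number of ints written by inner()    */
} Slice;

static unsigned __stdcall run_slice(void* arg)
{
  Slice* s = (Slice*)arg;
  s->nhits = inner(s->vM, s->vD, s->vC, s->vL, s->vX,
                   s->start, s->end, s->m2, s->d2, s->c2, s->l2, s->x2, s->ps);
  return 0;
}

/* split q3 = 1..127 into nthreads roughly equal slices (nthreads <= 64) */
static void search_parallel(Slice* slices, int nthreads)
{
  HANDLE h[64];
  int t, lo = 1, step = (126 + nthreads - 1) / nthreads;
  for (t = 0; t < nthreads; ++t, lo += step) {
    slices[t].start = lo;
    slices[t].end = (lo + step < 128) ? lo + step : 128;
    h[t] = (HANDLE)_beginthreadex(NULL, 0, run_slice, &slices[t], 0, NULL);
  }
  WaitForMultipleObjects((DWORD)nthreads, h, TRUE, INFINITE);
  for (t = 0; t < nthreads; ++t) CloseHandle(h[t]);
}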
Using intuition, Intel intrinsics, and benchmarking each change individually, I was able to reduce the running time from around five hours down to three, as shown below:
#include <emmintrin.h>
#include <smmintrin.h>
#define H_PRIME 1000003
#define UNROLL(q8) qqm = bytevecM[(unsigned int)(m7 ^ q8)]; \
  if (qqm != 0 \
      && qqm == bytevecD[(unsigned int)(s7.m128i_i32[0] ^ q8)] \
      && qqm == bytevecC[(unsigned int)(s7.m128i_i32[1] ^ q8)] \
      && qqm == bytevecL[(unsigned int)(s7.m128i_i32[2] ^ q8)] \
      && qqm == bytevecX[(unsigned int)(s7.m128i_i32[3] ^ q8)]) { \
    ps[j++] = _mm_set_epi16(0, qqm, q8, q7, q6, q5, q4, q3); }
int inner(
  const char* bytevecM,
  const char* bytevecD,
  const char* bytevecC,
  const char* bytevecL,
  const char* bytevecX,
  int startval, int endval,
  int m2, int d2, int c2, int l2, int x2,
  __m128i* ps
)
{
  __m128i s2 = _mm_set_epi32(x2, l2, c2, d2);
  __m128i hp = _mm_set1_epi32(H_PRIME);
  __m128i xt[128];
  __m128i s3, s4, s5, s6, s7;
  int qqm;
  int m3, m4, m5, m6, m7;
  int q3, q4, q5, q6, q7;
  int j = 0;
  int z; for (z = 1; z < 128; ++z) { xt[z] = _mm_set1_epi32(z); }
  for (q3 = startval; q3 < endval; ++q3)
  {
    if (q3 == 10 || q3 == 13) continue;
    m3 = (m2 ^ q3) * H_PRIME;
    s3 = _mm_mullo_epi32(_mm_xor_si128(s2, xt[q3]), hp);
    for (q4 = 1; q4 < 128; ++q4)
    {
      if (q4 == 10 || q4 == 13) continue;
      m4 = (m3 ^ q4) * H_PRIME;
      s4 = _mm_mullo_epi32(_mm_xor_si128(s3, xt[q4]), hp);
      for (q5 = 1; q5 < 128; ++q5)
      {
        if (q5 == 10 || q5 == 13) continue;
        m5 = (m4 ^ q5) * H_PRIME;
        s5 = _mm_mullo_epi32(_mm_xor_si128(s4, xt[q5]), hp);
        for (q6 = 1; q6 < 128; ++q6)
        {
          if (q6 == 10 || q6 == 13) continue;
          m6 = (m5 ^ q6) * H_PRIME;
          s6 = _mm_mullo_epi32(_mm_xor_si128(s5, xt[q6]), hp);
          for (q7 = 1; q7 < 128; ++q7)
          {
            if (q7 == 10 || q7 == 13) continue;
            m7 = (m6 ^ q7) * H_PRIME;
            s7 = _mm_mullo_epi32(_mm_xor_si128(s6, xt[q7]), hp);
            UNROLL(1)
            UNROLL(96)
            UNROLL(2)
            UNROLL(3)
            UNROLL(4)
            UNROLL(5)
            UNROLL(6)
            UNROLL(7)
            UNROLL(8)
            UNROLL(9)
            UNROLL(11)
            UNROLL(12)
            UNROLL(14)
            // ... snipped UNROLL(15) .. UNROLL(125)
            UNROLL(126)
            UNROLL(127)
          }
        }
      }
    }
  }
  return j;
}
Most of this speedup came from the manual unroll of the inner loop.
Since I am very new to Intel SSE/AVX instructions, please let me know if anything above made you pull a face.
Intel VTune reports the biggest hot spot occurs at the line:
UNROLL(1)
In the corresponding assembly code, the hot spots are shown below:
mov eax, ecx 0.917s
mov edx, ecx 0.062s
xor rax, 0x1
movdqa xmmword ptr [rsp+0x20], xmm0
mov ebx, dword ptr [rsp+0x2c] 0.155s
mov r11d, dword ptr [rsp+0x28] 0.949s
movsx ecx, byte ptr [rax+rdi*1] 0.156s
mov r9d, dword ptr [rsp+0x24] 91.132s
mov r8d, dword ptr [rsp+0x20] 0.233s
test ecx, ecx
jz 0x14000225b
---
mov eax, r8d 0.342s
xor rax, 0x1 0.047s
movsx eax, byte ptr [rax+r13*1] 0.124s
cmp ecx, eax 12.631s
jnz 0x14000225b
---
mov eax, r9d
xor rax, 0x1
movsx eax, byte ptr [rax+r12*1]
cmp ecx, eax 0.016s
jnz 0x14000225b
This seems like a "data locality" problem to me. Each time through the inner loop, the value of m7 varies wildly and unpredictably, in a 4 GB range, so you will likely get a cache miss for the first UNROLL(1) when looking up qqm=bytevecM[m7^1].
Since the subsequent UNROLL(2)..UNROLL(127) XOR m7 with 2..127, you will usually get a cache hit for the rest of the UNROLLs. Curiously, changing the order of the UNROLLs by moving UNROLL(96) to right after UNROLL(1) gave a significant speedup.
I understand that reading a byte from memory results in filling the (64-byte) cache line that contains the byte.
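To make that concrete for myself: since q8 < 128, m7 ^ q8 only changes the low 7 bits of m7, so all 127 probes of one UNROLL block land inside a single 128-byte-aligned region, i.e. at most two cache lines. A throwaway sketch (warm_both_lines is just an illustrative name, not part of the program):
#include <xmmintrin.h>

static void warm_both_lines(const char* bytevecM, unsigned int m7)
{
  unsigned int base = m7 & ~127u;                  /* start of the 128-byte region */
  _mm_prefetch(&bytevecM[base], _MM_HINT_T0);      /* first 64-byte line           */
  _mm_prefetch(&bytevecM[base + 64], _MM_HINT_T0); /* second 64-byte line; together
                                                      they cover every m7 ^ q8     */
  /* UNROLL(1) touches one of these lines, and UNROLL(96) flips bit 6 of the
     offset, so running it second touches the other line early. */
}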
Since I am very new to this area, I welcome any advice or good references on how to speed up memory lookups, especially when dealing with large tables (in my case, 4 GB tables).
I can't see an obvious way to improve data locality with the above algorithm; suggestions on how that might be achieved are also welcome.
Update 29 March 2013
Since this question was written, I've been able to further reduce the running time from three hours down to 20 minutes, as shown below.
Adding a 4 MB bitmap for each 4 GB bytevec reduced the time to around 40 minutes, which was further halved by adding some _mm_prefetch calls.
Note that the basic algorithm remains unchanged: data locality was improved by adding the bitmaps; latency was reduced by adding the _mm_prefetch calls.
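For reference, here is a sketch of how each 4 MB bitmap can be built from its 4 GB bytevec (build_bitvec and the size macros are just illustrative names; one bit per 128-byte block, laid out so that the pbvM[... >> 13] test in the code below works when the bitmap is read as little-endian size_t words):
#include <string.h>

#define BYTEVEC_SIZE (1ULL << 32)                     /* 4 GB lookup table        */
#define BLOCK_SIZE   128                              /* bytes covered by one bit */
#define BITVEC_SIZE  (BYTEVEC_SIZE / BLOCK_SIZE / 8)  /* 4 MB of presence bits    */

static void build_bitvec(const unsigned char* bytevec, unsigned char* bitvec)
{
  unsigned long long block;
  memset(bitvec, 0, (size_t)BITVEC_SIZE);
  for (block = 0; block < BYTEVEC_SIZE / BLOCK_SIZE; ++block) {
    const unsigned char* p = bytevec + block * BLOCK_SIZE;
    int i;
    for (i = 0; i < BLOCK_SIZE; ++i) {
      if (p[i] != 0) {             /* block contains at least one nonzero byte */
        bitvec[block >> 3] |= (unsigned char)(1 << (block & 7));
        break;
      }
    }
  }
}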
Suggestions for further performance improvements welcome. The improved program follows:
#include <emmintrin.h>
#include <smmintrin.h>
#define H_PRIME 1000003
#define UNROLL(qx) qqm = bytevecM[m7 ^ qx]; \
  if (qqm != 0 \
      && qqm == bytevecD[d7 ^ qx]) { \
    _mm_prefetch(&bytevecC[c7 ^ qx], _MM_HINT_T0); \
    qd[nqd++] = qqm; qd[nqd++] = qx; }
int inner(
  const unsigned char* bitvecM,
  const unsigned char* bitvecD,
  const unsigned char* bitvecC,
  const unsigned char* bitvecL,
  const unsigned char* bitvecX,
  const unsigned char* bitvecV,
  const unsigned char* bitvecI,
  const unsigned char* bytevecM,
  const unsigned char* bytevecD,
  const unsigned char* bytevecC,
  const unsigned char* bytevecL,
  const unsigned char* bytevecX,
  int startval, int endval,
  int m2, int d2, int c2, int l2, int x2, int v2, int i2,
  __m128i* ps
)
{
  __declspec(align(16)) __m128i s2 = _mm_set_epi32(i2, v2, x2, l2);
  __declspec(align(16)) __m128i hp = _mm_set1_epi32(H_PRIME);
  __declspec(align(16)) __m128i xt[128];
  __declspec(align(16)) __m128i s3, s4, s5, s6;
  int m3, m4, m5, m6;
  int d3, d4, d5, d6;
  int c3, c4, c5, c6;
  unsigned int m7, d7, c7, l7, x7, v7, i7;
  int qqm;
  int q3, q4, q5, q6, q7, q8;
  int iret = 0;
  unsigned int qf[128*4];
  int nqf;
  int qz;
  int qd[128*2];
  int nqd;
  int cnt;
  int qh[128*3];
  int nqh;
  int qi[128*5];
  int nqi;
  unsigned int m7arr[128];
  unsigned int d7arr[128];
  const size_t* pbvM = (size_t*)bitvecM;
  const size_t* pbvD = (size_t*)bitvecD;
  const size_t* pbvC = (size_t*)bitvecC;
  const size_t* pbvL = (size_t*)bitvecL;
  const size_t* pbvX = (size_t*)bitvecX;
  const size_t* pbvV = (size_t*)bitvecV;
  const size_t* pbvI = (size_t*)bitvecI;
  int z; for (z = 1; z < 128; ++z) { xt[z] = _mm_set1_epi32(z); }
  for (q3 = startval; q3 < endval; ++q3)
  {
    if (q3 == 10 || q3 == 13) continue;
    m3 = (m2 ^ q3) * H_PRIME;
    d3 = (d2 ^ q3) * H_PRIME;
    c3 = (c2 ^ q3) * H_PRIME;
    s3 = _mm_mullo_epi32(_mm_xor_si128(s2, xt[q3]), hp);
    for (q4 = 1; q4 < 128; ++q4)
    {
      if (q4 == 10 || q4 == 13) continue;
      m4 = (m3 ^ q4) * H_PRIME;
      d4 = (d3 ^ q4) * H_PRIME;
      c4 = (c3 ^ q4) * H_PRIME;
      s4 = _mm_mullo_epi32(_mm_xor_si128(s3, xt[q4]), hp);
      for (q5 = 1; q5 < 128; ++q5)
      {
        if (q5 == 10 || q5 == 13) continue;
        m5 = (m4 ^ q5) * H_PRIME;
        d5 = (d4 ^ q5) * H_PRIME;
        c5 = (c4 ^ q5) * H_PRIME;
        s5 = _mm_mullo_epi32(_mm_xor_si128(s4, xt[q5]), hp);
        for (q6 = 1; q6 < 128; ++q6)
        {
          if (q6 == 10 || q6 == 13) continue;
          m6 = (m5 ^ q6) * H_PRIME;
          d6 = (d5 ^ q6) * H_PRIME;
          c6 = (c5 ^ q6) * H_PRIME;
          s6 = _mm_mullo_epi32(_mm_xor_si128(s5, xt[q6]), hp);
          for (q7 = 1; q7 < 128; ++q7)
          {
            if (q7 == 10 || q7 == 13) continue;
            m7arr[q7] = (unsigned int)( (m6 ^ q7) * H_PRIME );
            _mm_prefetch((const char*)(&pbvM[m7arr[q7] >> 13]), _MM_HINT_T0);
            d7arr[q7] = (unsigned int)( (d6 ^ q7) * H_PRIME );
            _mm_prefetch((const char*)(&pbvD[d7arr[q7] >> 13]), _MM_HINT_T0);
          }
          nqh = 0;
          for (q7 = 1; q7 < 128; ++q7)
          {
            if (q7 == 10 || q7 == 13) continue;
            if ( (pbvM[m7arr[q7] >> 13] & ((size_t)1 << ((m7arr[q7] >> 7) & 63))) == 0 ) continue;
            if ( (pbvD[d7arr[q7] >> 13] & ((size_t)1 << ((d7arr[q7] >> 7) & 63))) == 0 ) continue;
            c7 = (unsigned int)( (c6 ^ q7) * H_PRIME );
            _mm_prefetch((const char*)(&pbvC[c7 >> 13]), _MM_HINT_T0);
            l7 = (unsigned int)( (s6.m128i_i32[0] ^ q7) * H_PRIME );
            _mm_prefetch((const char*)(&pbvL[l7 >> 13]), _MM_HINT_T0);
            qh[nqh++] = q7;
            qh[nqh++] = c7;
            qh[nqh++] = l7;
          }
          nqi = 0;
          cnt = 0;
          while (cnt < nqh)
          {
            q7 = qh[cnt++];
            c7 = qh[cnt++];
            l7 = qh[cnt++];
            if ( (pbvC[c7 >> 13] & ((size_t)1 << ((c7 >> 7) & 63))) == 0 ) continue;
            if ( (pbvL[l7 >> 13] & ((size_t)1 << ((l7 >> 7) & 63))) == 0 ) continue;
            x7 = (unsigned int)( (s6.m128i_i32[1] ^ q7) * H_PRIME );
            _mm_prefetch((const char*)(&pbvX[x7 >> 13]), _MM_HINT_T0);
            v7 = (unsigned int)( (s6.m128i_i32[2] ^ q7) * H_PRIME );
            _mm_prefetch((const char*)(&pbvV[v7 >> 13]), _MM_HINT_T0);
            qi[nqi++] = q7;
            qi[nqi++] = c7;
            qi[nqi++] = l7;
            qi[nqi++] = x7;
            qi[nqi++] = v7;
          }
          nqf = 0;
          cnt = 0;
          while (cnt < nqi)
          {
            q7 = qi[cnt++];
            c7 = qi[cnt++];
            l7 = qi[cnt++];
            x7 = qi[cnt++];
            v7 = qi[cnt++];
            if ( (pbvX[x7 >> 13] & ((size_t)1 << ((x7 >> 7) & 63))) == 0 ) continue;
            if ( (pbvV[v7 >> 13] & ((size_t)1 << ((v7 >> 7) & 63))) == 0 ) continue;
            i7 = (unsigned int)( (s6.m128i_i32[3] ^ q7) * H_PRIME );
            if ( (pbvI[i7 >> 13] & ((size_t)1 << ((i7 >> 7) & 63))) == 0 ) continue;
            _mm_prefetch(&bytevecD[d7arr[q7] & 0xffffff80], _MM_HINT_T0);
            _mm_prefetch(&bytevecD[64+(d7arr[q7] & 0xffffff80)], _MM_HINT_T0);
            qf[nqf++] = q7;
            qf[nqf++] = c7;
            qf[nqf++] = l7;
            qf[nqf++] = x7;
          }
          cnt = 0;
          while (cnt < nqf)
          {
            q7 = qf[cnt];
            cnt += 4;
            _mm_prefetch(&bytevecM[m7arr[q7] & 0xffffff80], _MM_HINT_T0);
            _mm_prefetch(&bytevecM[64+(m7arr[q7] & 0xffffff80)], _MM_HINT_T0);
          }
          qz = 0;
          while (qz < nqf)
          {
            q7 = qf[qz++];
            c7 = qf[qz++];
            l7 = qf[qz++];
            x7 = qf[qz++];
            m7 = m7arr[q7];
            d7 = d7arr[q7];
            nqd = 0;
            UNROLL(1)
            UNROLL(96)
            UNROLL(2)
            UNROLL(3)
            UNROLL(4)
            UNROLL(5)
            UNROLL(6)
            UNROLL(7)
            UNROLL(8)
            UNROLL(9)
            UNROLL(11)
            UNROLL(12)
            UNROLL(14)
            // ... snipped UNROLL(15) .. UNROLL(125)
            UNROLL(126)
            UNROLL(127)
            if (nqd == 0) continue;
            cnt = 0;
            while (cnt < nqd)
            {
              qqm = qd[cnt++];
              q8 = qd[cnt++];
              if (qqm == bytevecC[c7 ^ q8]
                && qqm == bytevecL[l7 ^ q8]
                && qqm == bytevecX[x7 ^ q8])
              {
                ps[iret++] = _mm_set_epi16(0, qqm, q8, q7, q6, q5, q4, q3);
              }
            }
          }
        }
      }
    }
  }
  return iret;
}
Answer 1:
If you truly need to access memory in a random fashion, then there is really only window dressing you can do to save time on main-memory stalls. The hardware simply cannot randomly jump to a memory address and have the data available within a single cycle. Of course there are always 'tricks', often dependent on the exact chipset. None springs to mind for this question though.
For very high performance systems, you often need to look carefully at the algorithm, or at trick techniques that avoid doing the work at all. Taking this question in the context of your other question, "Using C/Intel assembly, what is the fastest way to test if a 128-byte memory block contains all zeros?", and using the fact that most of the time the memory is zero, you could simply set 1 bit per 128 bytes (or even per 512 bytes) and use that as a short circuit for testing the complete block. This can be seen as a Bloom filter (http://en.wikipedia.org/wiki/Bloom_filter), although I prefer to think of it as an array with 1 bit per entry.
For a 4 GB lookup table, you will need about 4 MB of bits at 1 bit per 128 bytes, or about 1 MB at 1 bit per 4×128 bytes, and so on. The aim here is to get the smaller presence indicator to be more likely to stay in cache. Of course, whatever writes the lookup table will incur an additional memory write.
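For example, the short circuit could look like this (check_block is just an illustrative name; bits are packed into bytes, LSB first, one bit per 128-byte block):
static int check_block(const unsigned char* bitvec, unsigned int index)
{
  unsigned int block = index >> 7;                 /* which 128-byte block                 */
  return (bitvec[block >> 3] >> (block & 7)) & 1;  /* 1 = block may contain nonzero bytes  */
}

/* usage: skip the expensive 4 GB probe when the whole block is known to be zero */
/*   if (!check_block(bitvecM, m7)) continue;                                    */
/*   qqm = bytevecM[m7 ^ q8]; ...                                                */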
An alternative technique, which might not be suitable at all depending on how data is written to the lookup arrays, would be to store the addresses of the values in a sorted array rather than probing the array itself. This would need around 320 MB for the address values (if my maths is correct) and would let you use binary search; the initial probes, at least, should have good cache effectiveness.
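A rough sketch of that lookup (offsets, noffsets and find_offset are illustrative names; the offset array is assumed to have been sorted once up front with qsort):
#include <stdlib.h>

static int cmp_u32(const void* a, const void* b)
{
  unsigned int x = *(const unsigned int*)a, y = *(const unsigned int*)b;
  return (x > y) - (x < y);
}

/* returns 1 if 'key' is one of the stored (sorted) offsets */
static int find_offset(const unsigned int* offsets, size_t noffsets, unsigned int key)
{
  return bsearch(&key, offsets, noffsets, sizeof *offsets, cmp_u32) != NULL;
}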
Or, rather than doing the final inner loop immediately, save the input variables it requires and proceed to the next enclosing loop iteration. When finished, sort this list of inner-loop hits by memory address and then do them all in their own loop. There are a lot of gotchas with this technique, and it only really helps if you are likely to reuse the same memory block, but this method is used in the real world.
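Something along these lines (Pending and flush_pending are illustrative names; the point is only that the deferred probes end up walking memory in address order):
#include <stdlib.h>

typedef struct {
  unsigned int offset;         /* e.g. m7 ^ q8                        */
  int q3, q4, q5, q6, q7, q8;  /* loop state needed to finish the hit */
} Pending;

static int cmp_pending(const void* a, const void* b)
{
  unsigned int x = ((const Pending*)a)->offset, y = ((const Pending*)b)->offset;
  return (x > y) - (x < y);
}

static void flush_pending(Pending* pending, size_t npending, const char* bytevecM)
{
  size_t i;
  qsort(pending, npending, sizeof *pending, cmp_pending);  /* sort by address */
  for (i = 0; i < npending; ++i) {
    int qqm = bytevecM[pending[i].offset];
    /* ... finish the original inner-loop test for this candidate ... */
    (void)qqm;
  }
}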
Answer 2:
A few ideas:
I'd try a variation on this piece:
#define UNROLL(q8) qqm = bytevecM[(unsigned int)(m7 ^ q8)]; \
  if (qqm != 0 \
      && qqm == bytevecD[(unsigned int)(s7.m128i_i32[0] ^ q8)] \
      && qqm == bytevecC[(unsigned int)(s7.m128i_i32[1] ^ q8)] \
      && qqm == bytevecL[(unsigned int)(s7.m128i_i32[2] ^ q8)] \
      && qqm == bytevecX[(unsigned int)(s7.m128i_i32[3] ^ q8)]) { \
    ps[j++] = _mm_set_epi16(0, qqm, q8, q7, q6, q5, q4, q3); }
which might perform better if you make the following changes:
- use PXOR instead of four separate XOR instructions
- pull the _mm_set_epi16() out of the macro, i.e. do __m128i pp = _mm_set_epi16(0, 0, 0, q7, q6, q5, q4, q3); first thing in the innermost loop
- insert the iteration counter q8 and the qqm where/when needed
With those changes, it'll become something like:
#define UNROLL(q8) \
  qqm = bytevecM[(unsigned int)(m7 ^ q8)]; \
  if (qqm) { \
    __m128i qq; \
    pp = _mm_insert_epi16(pp, q8, 5); \
    qq = _mm_xor_si128(_mm_set1_epi32(q8), s7); \
    if (qqm == bytevecD[(unsigned int)qq.m128i_i32[0]] && \
        qqm == bytevecC[(unsigned int)qq.m128i_i32[1]] && \
        qqm == bytevecL[(unsigned int)qq.m128i_i32[2]] && \
        qqm == bytevecX[(unsigned int)qq.m128i_i32[3]]) \
      ps[j++] = _mm_insert_epi16(pp, qqm, 6); \
  }
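Inside inner(), the innermost q7 loop would then set up pp once per iteration and look roughly like this (lane numbers above follow the original _mm_set_epi16 layout):
for (q7 = 1; q7 < 128; ++q7)
{
  __m128i pp;
  if (q7 == 10 || q7 == 13) continue;
  m7 = (m6 ^ q7) * H_PRIME;
  s7 = _mm_mullo_epi32(_mm_xor_si128(s6, xt[q7]), hp);
  pp = _mm_set_epi16(0, 0, 0, q7, q6, q5, q4, q3);  /* q8 and qqm lanes filled by UNROLL */
  UNROLL(1)
  UNROLL(96)
  UNROLL(2)
  /* ... remaining UNROLLs as before ... */
}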
Also, the fact that you're getting a speedup from putting UNROLL(96) early means you're prepopulating the second cache line of each 128-byte section of the bytevecs at that point. This should also be achievable by adding:
_mm_prefetch(&bytevecM[m7 | 0x60], _MM_HINT_T0);
_mm_prefetch(&bytevecD[s7.m128i_i32[0] | 0x60], _MM_HINT_T0);
_mm_prefetch(&bytevecC[s7.m128i_i32[1] | 0x60], _MM_HINT_T0);
_mm_prefetch(&bytevecL[s7.m128i_i32[2] | 0x60], _MM_HINT_T0);
_mm_prefetch(&bytevecX[s7.m128i_i32[3] | 0x60], _MM_HINT_T0);
before the first UNROLL(1).
Source: https://stackoverflow.com/questions/15036169/how-to-improve-memory-performance-data-locality-of-64-bit-c-intel-assembly-progr