If you count how often each nibble value (16 possibilities) occurs at each nibble offset within the 64-bit word (16 possibilities), you can easily derive the per-bit totals afterwards by summing those counts. And those 256 counters are easily kept:
// Occurrence counters indexed as nibble_count[offset][value]: offset 0 is the
// least-significant nibble of a word passed to CountNibbles, and value is that
// nibble's 4-bit content.
// E.g. 0x000700B0 increments [4][7] and [1][0xB] (plus [i][0] for every other offset i).
unsigned long nibble_count[16][16];
// 64 counters; presumably one per bit position of the 64-bit word, filled in
// by SumNibbles -- TODO confirm, that function's body is not fully visible here.
unsigned long bitcount[64];
// Tally every 4-bit group of `bits` into the global nibble_count table.
// Offset 0 is the least-significant nibble; all 16 offsets are visited, so
// zero nibbles are counted as well (in nibble_count[offset][0]).
void CountNibbles(uint64 bits) {
    int offset = 0;
    while (offset != 16) {
        // The low four bits currently hold the nibble for this offset.
        nibble_count[offset][bits & 0xf] += 1;
        // Drop the nibble just counted so the next one moves into the low bits.
        bits = bits >> 4;
        offset += 1;
    }
}
void SumNibbles() {
for (int i = 0; i != 16; ++i) {
for (int nibble = 0; nibble != 16; ++nibble) {
for(int bitpos = 0; bitpos != 3; ++bitpos) {
if (nibble & (1<