/* Simple benchmark harness for different popcount implementations This code is derived from original source code by a1k0n http://www.a1k0n.net/temp/popcnt.c.txt and found via a comment by a1k0n at http://www.reddit.com/r/programming/info/22p5v/comments Additional popcount implementations by Andrew Dalke. For more details see http://www.dalkescientific.com/writings/diary/archive/2008/07/03/hakmem_and_other_popcounts.html Change Log: Version: 1.1 (5 July 2008) - Added Bitslice(7) and Bitslice(24) implementations - Moved the slowest implementations to the end Version: 1.0 (3 July 2008) - Initial release */ #include #include #include #define BUFSIZE (8*1048576) /* bufsize in bytes: 8 megs of junk */ #define ITERATIONS 300 /* count the bits in a bunch of junk and time it */ #define BITCOUNT(x) (((BX_(x)+(BX_(x)>>4)) & 0x0F0F0F0F) % 255) #define BX_(x) ((x) - (((x)>>1)&0x77777777) \ - (((x)>>2)&0x33333333) \ - (((x)>>3)&0x11111111)) unsigned long get_usecs(void) { struct timeval tv; gettimeofday(&tv, NULL); return tv.tv_sec*1000000+tv.tv_usec; } static inline int popcount_fbsd1(unsigned *buf, int n) { int cnt=0; do { unsigned m = *buf++; m = (m & 0x55555555) + ((m & 0xaaaaaaaa) >> 1); m = (m & 0x33333333) + ((m & 0xcccccccc) >> 2); m = (m & 0x0f0f0f0f) + ((m & 0xf0f0f0f0) >> 4); m = (m & 0x00ff00ff) + ((m & 0xff00ff00) >> 8); m = (m & 0x0000ffff) + ((m & 0xffff0000) >> 16); cnt += m; } while(--n); return cnt; } static inline int popcount_fbsd2(unsigned *buf, int n) { int cnt=0; do { unsigned v = *buf++; v -= ((v >> 1) & 0x55555555); v = (v & 0x33333333) + ((v >> 2) & 0x33333333); v = (v + (v >> 4)) & 0x0F0F0F0F; v = (v * 0x01010101) >> 24; cnt += v; } while(--n); return cnt; } unsigned char lut[65536]; static inline int popcount_lut16(unsigned *buf, int n) { int cnt=0; do { cnt += lut[(*buf)&65535]; cnt += lut[(*buf)>>16]; buf++; } while(--n); return cnt; } static inline int popcount_lut8(unsigned *buf, int n) { int cnt=0; unsigned int i; do { i = *buf; cnt += lut[i&255]; cnt += lut[i>>8&255]; cnt += lut[i>>16&255]; cnt += lut[i>>24]; /* APD: removed the unneeded &255 */ buf++; } while(--n); return cnt; } /* modification of the previous code */ static inline int popcount_lut8_v2(unsigned *buf, int n) { int cnt=0; unsigned int i; do { i = *buf; /* Small difference in code, big difference in performance on my machine */ /* The difference is compiler dependent. */ cnt += (lut[i&255] + lut[i>>8&255] + lut[i>>16&255] + lut[i>>24]); buf++; } while(--n); return cnt; } /* This is "Counting bits set, in parallel" from http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel I took the algorithms from http://en.wikipedia.org/wiki/Hamming_weight Also mentioned in http://gcc.gnu.org/bugzilla/attachment.cgi?id=15529 (which means it may be in a GCC near you soon). */ typedef unsigned long long uint64; //assume this gives 64-bits const uint64 m1 = 0x5555555555555555ULL; //binary: 0101... const uint64 m2 = 0x3333333333333333ULL; //binary: 00110011.. const uint64 m4 = 0x0f0f0f0f0f0f0f0fULL; //binary: 4 zeros, 4 ones ... const uint64 m8 = 0x00ff00ff00ff00ffULL; //binary: 8 zeros, 8 ones ... const uint64 m16 = 0x0000ffff0000ffffULL; //binary: 16 zeros, 16 ones ... const uint64 m32 = 0x00000000ffffffffULL; //binary: 32 zeros, 32 ones const uint64 hff = 0xffffffffffffffffULL; //binary: all ones const uint64 h01 = 0x0101010101010101ULL; //the sum of 256 to the power of 0,1,2,3... /* Implement 'popcount_2' from Wikipedia */ /* """This uses fewer arithmetic operations than any other known implementation on machines with slow multiplication. It uses 17 arithmetic operations.""" */ static inline int popcount_wikipedia_2(uint64 *buf, int n) { int cnt=0; uint64 x; do { x = *buf; x -= (x >> 1) & m1; //put count of each 2 bits into those 2 bits x = (x & m2) + ((x >> 2) & m2); //put count of each 4 bits into those 4 bits x = (x + (x >> 4)) & m4; //put count of each 8 bits into those 8 bits x += x >> 8; //put count of each 16 bits into their lowest 8 bits x += x >> 16; //put count of each 32 bits into their lowest 8 bits x += x >> 32; //put count of each 64 bits into their lowest 8 bits cnt += x & 0x7f; buf++; } while (--n); return cnt; } /* Implement 'popcount_3' from Wikipedia */ /* """This uses fewer arithmetic operations than any other known implementation on machines with fast multiplication. It uses 12 arithmetic operations, one of which is a multiply.""" */ static inline int popcount_wikipedia_3(uint64 *buf, int n) { int cnt=0; uint64 x; do { x = *buf; x -= (x >> 1) & m1; //put count of each 2 bits into those 2 bits x = (x & m2) + ((x >> 2) & m2); //put count of each 4 bits into those 4 bits x = (x + (x >> 4)) & m4; //put count of each 8 bits into those 8 bits cnt += (x * h01)>>56; //returns left 8 bits of x + (x<<8) + (x<<16) + (x<<24)+... buf++; } while (--n); return cnt; } /* How good are GCC's implementations? */ static inline int popcount_gcc_int(unsigned *buf, int n) { int cnt=0; while (n--) { cnt += __builtin_popcount(*buf++); } return cnt; } static inline int popcount_gcc_long_long(unsigned long long *buf, int n) { int cnt=0; while (n--) { cnt += __builtin_popcountll(*buf++); } return cnt; } /* http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetNaive */ static inline int popcount_naive(unsigned *buf, int n) { int cnt=0; unsigned v; do { v = *buf; while (v) { cnt += v&1; v >>= 1; } buf++; } while (--n); return cnt; } /* http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetKernighan */ /* I'm calling it "Wegner" beause that's the earliest citation but more people know it by Kernighan because of its use in K&R */ static inline int popcount_wegner(unsigned *buf, int n) { int cnt=0; unsigned v; while (n--) { v = *buf; while (v) { cnt++; v &= v-1; /* clear the least significant bit set */ } buf++; } return cnt; } /* http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSet64 */ /* I named it "Anderson" after the cited author */ static inline int popcount_anderson(unsigned *buf, int n) { int cnt=0; uint64 v; while (n--) { v = *buf; cnt += ((v & 0xfff) * 0x1001001001001ULL & 0x84210842108421ULL) % 0x1f; cnt += (((v & 0xfff000) >> 12) * 0x1001001001001ULL & 0x84210842108421ULL) % 0x1f; cnt += ((v >> 24) * 0x1001001001001ULL & 0x84210842108421ULL) % 0x1f; buf++; } return cnt; } /* This is the HAKMEM algorithm, used in X11 Using koders.com I found a few matches, including this one in xc/programs/Xserver/dix/colormap.c http://www.koders.com/c/fidF2F5E1AB476D73F56DD86D6F26AD03EECD6781B0.aspx?s=hakmem+169#L2068 / * next 3 magic statements count number of ones (HAKMEM #169) * / pixel = (mask >> 1) & 033333333333; pixel = mask - pixel - ((pixel >> 1) & 033333333333); if ((((pixel + (pixel >> 3)) & 030707070707) % 077) != planes) continue; Using code from http://compilers.iecc.com/comparch/article/95-07-065 */ static inline int popcount_hakmem_169(unsigned *buf, int n) { int cnt=0; unsigned tmp, w; while (n--) { w = *buf; tmp = w - ((w >> 1) & 033333333333) - ((w >> 2) & 011111111111); cnt += ((tmp + (tmp >> 3)) & 030707070707) % 63; buf++; } return cnt; } void init_lut(void) { int i; for(i=0;i<65536;i++) { lut[i] = BITCOUNT(i); } } #define ROLADC8 __asm__("rolb %%al; "\ "adcl $0,%1;": "=a"(c), "=r"(cnt) : "0"(c), "1"(cnt)) static inline int popcount_roladc8(unsigned char *buf, int n) { int cnt=0; do { char c = *buf++; ROLADC8; ROLADC8; ROLADC8; ROLADC8; ROLADC8; ROLADC8; ROLADC8; ROLADC8; } while(--n); return cnt; } static inline int popcount_roladc32(unsigned *buf, int n) { int cnt=0; do { unsigned c = *buf++; #define ROLADC32 __asm__("roll %0; "\ "adcl $0,%1;": "=r"(c), "=r"(cnt) : "0"(c), "1"(cnt)) ROLADC32; ROLADC32; ROLADC32; ROLADC32; ROLADC32; ROLADC32; ROLADC32; ROLADC32; ROLADC32; ROLADC32; ROLADC32; ROLADC32; ROLADC32; ROLADC32; ROLADC32; ROLADC32; ROLADC32; ROLADC32; ROLADC32; ROLADC32; ROLADC32; ROLADC32; ROLADC32; ROLADC32; ROLADC32; ROLADC32; ROLADC32; ROLADC32; ROLADC32; ROLADC32; ROLADC32; ROLADC32; } while(--n); return cnt; } /* This comes from Terje Mathisen in a post to comp.arch.arithmetic */ /* See http://www.ciphersbyritter.com/NEWS4/BITCT.HTM */ /* This is a bit-slice solution processing 7 words at a time */ #define FULL_ADD(c1, c0, w0, w1, s1, s2) w1 = s1; c0 = w0; w0 &= w1; \ c0 ^= w1; w1 = s2; c1 = c0; c0 ^= w1; c1 &= w1; c1 |= w0 #define MASK55 0x55555555 #define MASK33 0x33333333 #define MASK0F 0x0f0f0f0f unsigned count_bits7(unsigned *src) { unsigned c0, c1, t0, t1, d0, d1, e0, e1, f1, f2; t0 = src[0]; FULL_ADD(c1, c0, t0, t1, src[1], src[2]); // c1:c0 Up to 4 live vars+src[] FULL_ADD(d1, d0, c0, t1, src[3], src[4]); // d1:d0, c1 Up to 5 live vars+src[] FULL_ADD(e1, e0, d0, t1, src[5], src[6]); // e1:e0, d1, c1 Up to 6 live vars+src[] FULL_ADD(f2, f1, c1, t1, d1, e1); // f2:f1:e0 e0 -= (e0 >> 1) & MASK55; // 2 bits, 0-2 f1 -= (f1 >> 1) & MASK55; f2 -= (f2 >> 1) & MASK55; e0 = (e0 & MASK33) + ((e0 >> 2) & MASK33); // 4 bits, 0-4 f1 = (f1 & MASK33) + ((f1 >> 2) & MASK33); f2 = (f2 & MASK33) + ((f2 >> 2) & MASK33); e0 += e0 >> 4; f1 += f1 >> 4; f2 += f2 >> 4; // 4 bits, 0-8 e0 &= MASK0F; f1 &= MASK0F; f2 &= MASK0F; // 8 bits, 0-8 e0 += (f1 << 1) + (f2 << 2); // 8 bits, 0-8+16+32 = 56 e0 += e0 >> 8; // 8 bits, 0-112 e0 += e0 >> 16; // 8 bits, 0-224 return e0 & 255; } /* Process 7 words at a time, plus a bit for the remainder */ static inline int popcount_7words(unsigned *buf, int n) { int cnt=0, i; for (i=0; i>1)&0x55555555; count1 = count1 - ((count1 >> 1) & 0x55555555); count2 = count2 - ((count2 >> 1) & 0x55555555); count1+=half1; count2+=half2; count1 = (count1 & 0x33333333) + ((count1 >> 2) & 0x33333333); count2 = (count2 & 0x33333333) + ((count2 >> 2) & 0x33333333); count1+=count2; count1 = (count1&0x0F0F0F0F)+ ((count1 >> 4) & 0x0F0F0F0F); count1 = count1 + (count1 >> 8); count1 = count1 + (count1 >> 16); return count1 & 0x000000FF; } static inline int merging3(const unsigned *data) { unsigned count1,count2,half1,half2,acc=0; int i; for(i=0;i<24;i+=3) { count1=data[i]; count2=data[i+1]; //w = data[i+2]; half1=data[i+2]&0x55555555; half2=(data[i+2]>>1)&0x55555555; count1 = count1 - ((count1 >> 1) & 0x55555555); count2 = count2 - ((count2 >> 1) & 0x55555555); count1+=half1; count2+=half2; count1 = (count1 & 0x33333333) + ((count1 >> 2) & 0x33333333); count1 += (count2 & 0x33333333) + ((count2 >> 2) & 0x33333333); acc += (count1 & 0x0F0F0F0F)+ ((count1>>4) &0x0F0F0F0F); } acc = (acc & 0x00FF00FF)+ ((acc>>8)&0x00FF00FF); acc = acc + (acc >> 16); return acc & 0x00000FFFF; } /* count 24 words at a time, then 3 at a time, then 1 at a time */ static inline int popcount_24words(unsigned *buf, int n) { int cnt=0, i; for (i=0; i