エラトステネスの篩
1億(108)まで篩ってみると1秒ほど差が出た。アセンブリの方が1.762秒で普通の方が2.772秒。(Core2Quad Q9450)
#define MAX 100000000 #define SQRT_MAX 10000 char flags[MAX/8+1]; void Eratosthenes() { __asm__ ("movl $2, %%ecx \n\t" "erloop1: \n\t" "cmpl %2, %%ecx \n\t" "jg exiter \n\t" "bt %%ecx, %0 \n\t" "jc erskip \n\t" "movl %%ecx, %%eax \n\t" "imul %%ecx, %%eax \n\t" "erloop2: \n\t" "cmpl %1, %%eax \n\t" "jg erskip \n\t" "bts %%eax, %0 \n\t" "addl %%ecx, %%eax \n\t" "jmp erloop2 \n\t" "erskip: \n\t" "incl %%ecx \n\t" "jmp erloop1 \n\t" "exiter: \n\t" :"=m"(flags) :"r"(MAX),"r"(SQRT_MAX)); }
#define MAX 100000000 #define SQRT_MAX 10000 static char flags[MAX+1]; void EratosthenesNaive() { int i, j; for(i = 2; i <= SQRT_MAX; ++i) { if(!flags[i]) { for(j = i*i; j <= MAX; j += i) flags[j] = 1; } } }