エラトステネスの篩 - TopCoderとJ言語と時々F#

1億(10⁸)まで篩ってみると1秒ほど差が出た。アセンブリの方が1.762秒で普通の方が2.772秒。(Core2Quad Q9450)

#define MAX 100000000
#define SQRT_MAX 10000
char flags[MAX/8+1];

void Eratosthenes()
{
  __asm__ 
    ("movl $2, %%ecx  \n\t"
     "erloop1:  \n\t"
     "cmpl %2, %%ecx  \n\t"
     "jg exiter  \n\t"
     "bt %%ecx, %0  \n\t"
     "jc erskip  \n\t"
     "movl %%ecx, %%eax  \n\t"
     "imul %%ecx, %%eax  \n\t"
     "erloop2:  \n\t"
     "cmpl %1, %%eax  \n\t"
     "jg erskip  \n\t"
     "bts %%eax, %0  \n\t"
     "addl %%ecx, %%eax  \n\t"
     "jmp erloop2  \n\t"
     "erskip:  \n\t"
     "incl %%ecx  \n\t"
     "jmp erloop1  \n\t"
     "exiter: \n\t"
     :"=m"(flags)
     :"r"(MAX),"r"(SQRT_MAX));
}

#define MAX 100000000
#define SQRT_MAX 10000
static char flags[MAX+1];

void EratosthenesNaive()
{
  int i, j;
  for(i = 2; i <= SQRT_MAX; ++i) {
    if(!flags[i]) {
      for(j = i*i; j <= MAX; j += i)
	flags[j] = 1; 
    }
  }
}