diff -ru mpeg2enc.orig/MPEG.avi2yuv mpeg2enc/MPEG.avi2yuv
--- mpeg2enc.orig/MPEG.avi2yuv	Fri Apr  9 17:59:14 1999
+++ mpeg2enc/MPEG.avi2yuv	Mon Apr 10 20:01:30 2000
@@ -1,5 +1,5 @@
 recorded from video , 25 frames/sec
-/video/Red_4    /* name of source files */
+/mnt/test    /* name of source files */
 -         /* name of reconstructed images ("-": don't store) */
 -         /* name of intra quant matrix file     ("-": default matrix) */ 
 -         /* name of non intra quant matrix file ("-": default matrix) */
diff -ru mpeg2enc.orig/Makefile mpeg2enc/Makefile
--- mpeg2enc.orig/Makefile	Fri Apr  9 18:15:43 1999
+++ mpeg2enc/Makefile	Sat Apr 15 12:22:46 2000
@@ -36,7 +36,10 @@
 # GNU gcc
 #
 CC = gcc
-CFLAGS = -O3 -ffast-math -fomit-frame-pointer -funroll-loops -march=k6 -mcpu=k6 -Wall -I/home/uhecht/mpeg_encode/jpeg
+#CFLAGS = -DHAVE_MMX -O3 -ffast-math -fomit-frame-pointer -funroll-loops -march=k6 -mcpu=k6 -Wall -I/home/uhecht/mpeg_encode/jpeg
+#CFLAGS = -DAMD -DFDCTINT -DHAVE_MMX -O3 -ffast-math -fomit-frame-pointer -funroll-loops -malign-loops=2 -malign-jumps=2 -march=k6 -mcpu=k6 -Wall -I/home/uhecht/mpeg_encode/jpeg
+CFLAGS = -DAMD -DFDCTINT -DHAVE_MMX -O3 -fomit-frame-pointer -funroll-loops -malign-loops=2 -malign-jumps=2 -march=k6 -mcpu=k6 -Wall -I/home/uhecht/mpeg_encode/jpeg
+#CFLAGS = -Dinline=static -pg -DAMD -DFDCTINT -DHAVE_MMX -O2 -ffast-math -funroll-loops -malign-loops=2 -malign-jumps=2 -march=k6 -mcpu=k6 -Wall -I/home/uhecht/mpeg_encode/jpeg
 
 OBJ = mpeg2enc.o conform.o putseq.o putpic.o puthdr.o putmpg.o putvlc.o\
       putbits.o motion.o predict.o readpic.o writepic.o transfrm.o fdctref.o\
Binary files mpeg2enc.orig/conform.o and mpeg2enc/conform.o differ
Only in mpeg2enc: eins
diff -ru mpeg2enc.orig/fdctref.c mpeg2enc/fdctref.c
--- mpeg2enc.orig/fdctref.c	Fri Apr  9 18:14:20 1999
+++ mpeg2enc/fdctref.c	Sat Apr 15 06:19:35 2000
@@ -46,8 +46,16 @@
 /* private data */
 /*
 static double c[8][8]; */ /* transform coefficients */
+#ifdef FDCTINT
+static int c[8][8];
+#ifdef AMD
+static int d[8][8];
+#endif
+#else
 static float c[8][8];
+#endif
 
+#ifndef FDCTINT
 void init_fdct()
 {
   int i, j;
@@ -64,7 +72,79 @@
       c[i][j] = s * cos((PI/8.0)*i*(j+0.5));
   }
 }
+#else
+#ifdef AMD
+static int DIVIDE=65536;
 
+void mmx_create_table(int* in, int* out) /* *out = raw float bits of (float)*in * pfrcp((float)DIVIDE); pfrcp is only an approximate reciprocal */
+{
+	asm("
+		.align 8
+		movd (%%ecx),%%mm0;
+		pi2fd %%mm0, %%mm0;
+		movd DIVIDE, %%mm1;
+		pi2fd %%mm1, %%mm1;
+		pfrcp %%mm1, %%mm1;
+		pfmul %%mm1, %%mm0;
+		movd %%mm0, (%%ebx);
+		femms;
+	"
+	: : "b" (out), "c" (in) : "memory"); /* "memory": the asm reads *in and DIVIDE and writes *out, none of which are operands */
+}
+static int pointfive=0;
+static int one=1;
+static int two=2;
+
+void mmx_setpointfive(void) /* stores the float bit pattern of 1 * pfrcp(2), i.e. roughly 0.5f, into the int variable pointfive */
+{
+	asm("
+		.align 8
+		movd one, %%mm0;
+		pi2fd %%mm0, %%mm0;
+		movd two, %%mm1;
+		pi2fd %%mm1, %%mm1;
+		pfrcp %%mm1, %%mm1;
+		pfmul %%mm1, %%mm0;
+		movd %%mm0, pointfive;
+		femms;
+	" : : ); /* no operands: one, two and pointfive are referenced by symbol name inside the asm text */
+}
+#endif
+
+/* Build the DCT basis table scaled by 65536.  Without AMD the scaled value is
+ * stored as an integer in c[][]; with AMD each scaled integer is passed to
+ * mmx_create_table(), which writes its converted form into d[][]. */
+void init_fdct()
+{
+  int i, j;
+  float s;
+#ifdef AMD
+  int tmp;
+#endif
+  for (i=0; i<8; i++)
+  {
+    s = (i==0) ? 0.3535533905932737622 /* sqrt(0.125) */ : 0.5;
+
+    for (j=0; j<8; j++)
+    {
+#ifndef AMD
+      c[i][j] = (s * cos((PI/8.0)*i*(j+0.5)))*65536.0;
+      //printf("%d ",c[i][j]);
+#else
+      tmp=(s * cos((PI/8.0)*i*(j+0.5)))*65536.0;
+      /* leftover debug printf("tmp %d\r",tmp) removed: it wrote to stdout 64 times per call */
+      mmx_create_table(&tmp,&d[i][j]);
+      //printf("%d\n",d[i][j]);
+#endif
+    }
+  }
+#ifdef AMD
+  mmx_setpointfive(); /* prepares the pointfive value that fdct() loads into mm0 */
+#endif
+}
+#endif
+
+#ifndef FDCTINT
 void fdct(block)
 short *block;
 {
@@ -107,3 +187,251 @@
        */
     }
 }
+#else
+#ifdef AMD
+unsigned long long mul1=0x0001000100010001;
+
+inline void mmx_fdct(int* c, short* block, int* tmp) /* dot product of the 8 shorts at block with the 8 floats at c; one 32-bit result goes to *tmp (float bits by default, an int when STABLE) */
+{
+#ifdef STABLE
+	asm("
+		.align 8
+		movq mul1, %%mm7;
+		movq (%%ecx), %%mm6; // load 4 elements from block
+		movq %%mm6, %%mm5;   // copy 4 unpack
+		pmulhw %%mm7, %%mm6;
+		pmullw %%mm7, %%mm5;
+		movq %%mm5, %%mm2;
+		punpcklwd %%mm6, %%mm5; // lower half
+		pi2fd %%mm5, %%mm4; // as float in mm4
+		punpckhwd %%mm6, %%mm2; // upper half
+		pi2fd %%mm2, %%mm5; // as float in mm5
+		movq (%%ebx), %%mm3; // lower half of table
+		movq 8(%%ebx), %%mm6; // upper half of table
+		pfmul %%mm4, %%mm3;
+		pfmul %%mm5, %%mm6;
+		pfacc %%mm3, %%mm6;
+
+		movq 8(%%ecx), %%mm2; // load 4 elements from block
+		movq %%mm2, %%mm5;   // copy 4 unpack
+		pmulhw %%mm7, %%mm2;
+		pmullw %%mm7, %%mm5;
+		movq %%mm5, %%mm3;
+		punpcklwd %%mm2, %%mm5; // lower half
+		pi2fd %%mm5, %%mm4; // as float in mm4
+		punpckhwd %%mm2, %%mm3; // upper half
+		pi2fd %%mm3, %%mm5; // as float in mm5
+		movq 16(%%ebx), %%mm3; // lower half of table
+		movq 24(%%ebx), %%mm2; // upper half of table
+		pfmul %%mm4, %%mm3;
+		pfmul %%mm5, %%mm2;
+		pfacc %%mm3, %%mm2;
+		pfadd %%mm2, %%mm6;
+		pf2id %%mm6, %%mm6;
+		movq %%mm6, %%mm5;
+		psrlq $32, %%mm5;
+		paddd %%mm5, %%mm6;
+		movd %%mm6, (%%edx);
+		femms;
+		"
+		:
+		: "b" (c), "c" (block), "d" (tmp)
+		: "st", "memory" ); /* "memory": table and block are read and *tmp is written through the pointers */
+#else
+#warning EXP
+	asm("
+		.align 8
+		movq mul1, %%mm7;
+		movq (%%ecx), %%mm6; // load 4 elements from block
+		movq %%mm6, %%mm5;   // copy 4 unpack
+		pmulhw %%mm7, %%mm6;
+		pmullw %%mm7, %%mm5;
+		movq %%mm5, %%mm2;
+		punpcklwd %%mm6, %%mm5; // lower half
+		movq (%%ebx), %%mm3; // lower half of table
+		pi2fd %%mm5, %%mm4; // as float in mm4
+		punpckhwd %%mm6, %%mm2; // upper half
+		pi2fd %%mm2, %%mm5; // as float in mm5
+		movq 8(%%ebx), %%mm6; // upper half of table
+		pfmul %%mm4, %%mm3;
+		pfmul %%mm5, %%mm6;
+		movq 8(%%ecx), %%mm2; // load 4 elements from block
+		pfacc %%mm3, %%mm6;
+
+		movq %%mm2, %%mm5;   // copy 4 unpack
+		pmulhw %%mm7, %%mm2;
+		movq 16(%%ebx), %%mm3; // lower half of table
+		pmullw %%mm7, %%mm5;
+		movq %%mm5, %%mm1;
+		punpcklwd %%mm2, %%mm5; // lower half
+		pi2fd %%mm5, %%mm4; // as float in mm4
+		punpckhwd %%mm2, %%mm1; // upper half
+		pi2fd %%mm1, %%mm5; // as float in mm5
+		movq 24(%%ebx), %%mm2; // upper half of table
+		pfmul %%mm4, %%mm3;
+		pfmul %%mm5, %%mm2;
+		pfacc %%mm3, %%mm2;
+		pfadd %%mm2, %%mm6;
+		movq %%mm6, %%mm5;
+		psrlq $32, %%mm5;
+		pfadd %%mm5, %%mm6;
+		movd %%mm6, (%%edx);
+		//femms;
+		"
+		:
+		: "b" (c), "c" (block), "d" (tmp)
+		: "st", "memory" ); /* "memory": table and block are read and *tmp is written through the pointers */
+#endif
+}
+
+inline void mmx_fdct2(int* c, int* block,int* res) /* dot product of the 8 floats at block with the 8 floats at c, plus mm0 (preloaded by the caller), converted by pf2id and stored at *res */
+{
+#ifdef STABLE2
+	asm("
+		.align 8
+		movq (%%ecx), %%mm4; // load 2 elements from block
+		movq (%%ebx), %%mm3; // lower half of table
+		pfmul %%mm4, %%mm3;
+		movq 8(%%ecx), %%mm5;
+		movq 8(%%ebx), %%mm6; // upper half of table
+		pfmul %%mm5, %%mm6;
+
+		movq 16(%%ecx), %%mm4; // load 2 elements from block
+		pfacc %%mm3, %%mm6;
+		movq 24(%%ecx), %%mm5;
+		movq 16(%%ebx), %%mm3; // lower half of table
+		movq 24(%%ebx), %%mm2; // upper half of table
+		pfmul %%mm4, %%mm3;
+		pfmul %%mm5, %%mm2;
+		pfacc %%mm3, %%mm2;
+
+		pfadd %%mm2, %%mm6;
+		movq %%mm6, %%mm5;
+		psrlq $32, %%mm5;
+		pfadd %%mm5, %%mm6;
+		pfadd %%mm0, %%mm6;
+		pf2id %%mm6, %%mm6;
+		movd %%mm6, (%%edx);
+		//femms;
+		"
+		:
+		: "b" (c), "c" (block), "d" (res)
+		: "memory"); /* table and block are read and *res is written through the pointers */
+#else
+#warning EXP
+	asm("
+		.align 8
+		movq (%%ecx), %%mm4; // load 2 elements from block
+		movq (%%ebx), %%mm3; // lower half of table
+		pfmul %%mm4, %%mm3;
+		movq 8(%%ecx), %%mm5;
+		movq 8(%%ebx), %%mm6; // upper half of table
+		pfmul %%mm5, %%mm6;
+
+		movq 16(%%ecx), %%mm4; // load 2 elements from block
+		pfacc %%mm3, %%mm6;
+		movq 16(%%ebx), %%mm3; // lower half of table
+		movq 24(%%ecx), %%mm5;
+		pfmul %%mm4, %%mm3;
+		movq 24(%%ebx), %%mm2; // upper half of table
+		pfmul %%mm5, %%mm2;
+		pfacc %%mm3, %%mm2;
+
+		pfadd %%mm2, %%mm6;
+		movq %%mm6, %%mm5;
+		psrlq $32, %%mm5;
+		pfadd %%mm5, %%mm6;
+		pfadd %%mm0, %%mm6;
+		pf2id %%mm6, %%mm6;
+		movd %%mm6, (%%edx);
+		"
+		:
+		: "b" (c), "c" (block), "d" (res)
+		: "memory"); /* table and block are read and *res is written through the pointers */
+#endif
+}
+#endif
+
+volatile static int res;
+
+/* Forward 8x8 DCT of block[64], computed in place as two 1-D passes:
+ * pass 1 fills tmp[8*j+i] from row i of block, pass 2 writes block[8*i+j]
+ * from row j of tmp.  Without AMD both passes use the integer table c[][];
+ * with AMD they call mmx_fdct()/mmx_fdct2() on the table d[][]. */
+void fdct(block)
+short *block;
+{
+  int i, j;
+#ifndef AMD
+  int k;
+  int s;
+#endif
+  int tmp[64];
+
+  for (i=0; i<8; i++)
+    for (j=0; j<8; j++)
+    {
+#ifndef AMD
+      s = 0;
+
+      for (k=0; k<8; k++)
+        s += ((c[j][k])) * ((block[8*i+k]));
+      tmp[8*j+i] = s;
+#else
+      mmx_fdct(d[j],&block[8*i],&tmp[8*j+i]);
+#endif
+    }
+
+#ifdef AMD
+  /* mm0 = the value every mmx_fdct2() call below adds to its sum before pf2id */
+  asm("
+  		.align 8
+  		movd pointfive, %%mm0;
+  " : : );
+#endif
+
+  for (j=0; j<8; j++)
+    for (i=0; i<8; i++)
+    {
+#ifndef AMD
+      s = 0;
+
+      for (k=0; k<8; k++)
+        s += ((c[i][k])>>8) * ((tmp[8*j+k])>>8);
+      //block[8*i+j] = (int)floor(s+0.499999);
+      block[8*i+j] = (s+(0x7fff))>>16;
+      /*
+       * reason for adding 0.499999 instead of 0.5:
+       * s is quite often x.5 (at least for i and/or j = 0 or 4)
+       * and setting the rounding threshold exactly to 0.5 leads to an
+       * extremely high arithmetic implementation dependency of the result;
+       * s being between x.5 and x.500001 (which is now incorrectly rounded
+       * downwards instead of upwards) is assumed to occur less often
+       * (if at all)
+       */
+#else
+      mmx_fdct2(d[i],&tmp[8*j],&res);
+      block[8*i+j]=res;
+#endif
+    }
+#ifdef AMD
+  /* Leave MMX state once, after the last mmx_fdct2() call.  femms used to run
+   * inside the loop, but AMD documents the MMX register contents as undefined
+   * after FEMMS, and mm0 must survive until the final iteration. */
+  asm("femms;");
+#endif
+}
+#endif
+
+#ifdef MAIN
+int main() /* MAIN-only timing loop: calls fdct() 200000 times on a fixed buffer; init_fdct() is not called here */
+{
+	volatile int vektor1[32]={1189904896,1178856448,-968627200,-957578752,-957578752,-968627200,1178856448,1189904896};
+	volatile int vektor2[8]={42812,115897,-602414,115897,-425971,413432,1,1};
+	volatile int i=0,j; /* i was printed uninitialized before */
+	for(j=0;j<200000;j++)
+	fdct(vektor1);
+	printf("%d\n",i); return 0;
+}
+#endif
+                
\ No newline at end of file
Only in mpeg2enc.orig: fdctref.c,v
Only in mpeg2enc: fdctref.c.int
Binary files mpeg2enc.orig/fdctref.o and mpeg2enc/fdctref.o differ
Only in mpeg2enc: fdctref.s
Only in mpeg2enc: gmon.out
Binary files mpeg2enc.orig/idct.o and mpeg2enc/idct.o differ
diff -ru mpeg2enc.orig/motion.c mpeg2enc/motion.c
--- mpeg2enc.orig/motion.c	Sat Jul 20 00:47:49 1996
+++ mpeg2enc/motion.c	Sun Apr 16 13:11:20 2000
@@ -1343,97 +1343,379 @@
  * h:         height of block (usually 8 or 16)
  * distlim:   bail out if sum exceeds this value
  */
-static int dist1(blk1,blk2,lx,hx,hy,h,distlim)
-unsigned char *blk1,*blk2;
-int lx,hx,hy,h;
-int distlim;
+
+
+void inline mmx_start_block() /* zeroes mm7 (the accumulator the mmx_*diff helpers add into) and mm6 (the zero they unpack against) */
 {
-  unsigned char *p1,*p1a,*p2;
-  int i,j;
-  int s,v;
+	asm(" 
+		.align 8
+		pxor %%mm7, %%mm7; 
+		pxor %%mm6, %%mm6;
+		" : : );
+}
 
-  s = 0;
-  p1 = blk1;
-  p2 = blk2;
+void inline mmx_absdiff(unsigned char *a, unsigned char *b) /* adds |a[k]-b[k]| for k=0..15 into the four word lanes of mm7; expects mm6 == 0 */
+{
+#if STABLEABSDIFF
+#warning STABLEABSDIFF
+	asm("
+		.align 8
+		movq		(%%ebx),	%%mm0;     // Get first half of row1
+		movq		(%%ecx),	%%mm1;     // Get first half of row2
+		movq		%%mm0,		%%mm2;     // Make a copy of row1 for absdiff operation
+		movq		8(%%ebx),	%%mm3;     // Get second half of row1
+		psubusb		%%mm1,		%%mm0;     // Subtract the first halves one way
+		psubusb		%%mm2,		%%mm1;     // Subtract the other way
+		movq        8(%%ecx),   %%mm4;     // Get second half of row2
+		por			%%mm1,      %%mm0;     // Merge first half results
+		movq		%%mm3,		%%mm5;     // Copy for absdiff operation
+		movq		%%mm0,		%%mm1;     // Keep a copy
+		psubusb		%%mm4,		%%mm3;     // Subtract second halves one way
+		punpcklbw	%%mm6,		%%mm0;     // Unpack to higher precision for accumulation
+		psubusb		%%mm5,		%%mm4;     // Subtract the other way
+		psrlq		$32,		%%mm1;     // Shift registeres for accumulation
+		por			%%mm4,		%%mm3;     // merge results of 2nd half
+		punpcklbw	%%mm6,		%%mm1;     // unpack to higher precision for accumulation
+		movq		%%mm3,		%%mm4;     // keep a copy
+		punpcklbw	%%mm6,		%%mm3;     // unpack to higher precision for accumulation
+		paddw		%%mm0,		%%mm7;     // accumulate difference
+		psrlq		$32,		%%mm4;     // shift results for accumulation
+		paddw		%%mm1,		%%mm7;     // accumulate difference
+		punpcklbw	%%mm6,		%%mm4;     // unpack to higher precision for accumulation
+		paddw		%%mm3,		%%mm7;     // accumulate difference
+		paddw		%%mm4,		%%mm7;     // accumulate difference
+		"
+		:
+		: "b" (a), "c" (b)
+		: "st", "memory" ); /* "memory": both rows are read through the pointers */
+#else
+        asm("
+                .align 8
+                movq            (%%ebx),        %%mm0;     // Get first half
+                movq            %%mm0,          %%mm2;     // Make a copy of
+                movq            (%%ecx),        %%mm1;     // Get first half
+                psubusb         %%mm1,          %%mm0;     // Subtract the
+                psubusb         %%mm2,          %%mm1;     // Subtract the
+                movq            8(%%ebx),       %%mm3;     // Get second half
+                por                     %%mm1,      %%mm0;     // Merge first
+                 movq        8(%%ecx),   %%mm4;     // Get second half of row2
+                movq            %%mm3,          %%mm2;
+                psubusb         %%mm4,          %%mm3;
+                movq            %%mm0,          %%mm1;
+                psubusb         %%mm2,          %%mm4;
+                por             %%mm4,          %%mm3;
+                punpcklbw       %%mm6,          %%mm0;
+                movq            %%mm3,          %%mm4;
+                paddw           %%mm0,          %%mm7;
+                punpckhbw       %%mm6,          %%mm1;
+                punpcklbw       %%mm6,          %%mm3;
+                paddw           %%mm1,          %%mm7;
+                punpckhbw       %%mm6,          %%mm4;
+                paddw           %%mm3,          %%mm7;
+                paddw           %%mm4,          %%mm7;
+                "
+                :
+                : "b" (a), "c" (b)
+                : "st", "memory" ); /* "memory": both rows are read through the pointers */
+#endif
+}
 
-  if (!hx && !hy)
-    for (j=0; j<h; j++)
-    {
-      if ((v = p1[0]  - p2[0])<0)  v = -v; s+= v;
-      if ((v = p1[1]  - p2[1])<0)  v = -v; s+= v;
-      if ((v = p1[2]  - p2[2])<0)  v = -v; s+= v;
-      if ((v = p1[3]  - p2[3])<0)  v = -v; s+= v;
-      if ((v = p1[4]  - p2[4])<0)  v = -v; s+= v;
-      if ((v = p1[5]  - p2[5])<0)  v = -v; s+= v;
-      if ((v = p1[6]  - p2[6])<0)  v = -v; s+= v;
-      if ((v = p1[7]  - p2[7])<0)  v = -v; s+= v;
-      if ((v = p1[8]  - p2[8])<0)  v = -v; s+= v;
-      if ((v = p1[9]  - p2[9])<0)  v = -v; s+= v;
-      if ((v = p1[10] - p2[10])<0) v = -v; s+= v;
-      if ((v = p1[11] - p2[11])<0) v = -v; s+= v;
-      if ((v = p1[12] - p2[12])<0) v = -v; s+= v;
-      if ((v = p1[13] - p2[13])<0) v = -v; s+= v;
-      if ((v = p1[14] - p2[14])<0) v = -v; s+= v;
-      if ((v = p1[15] - p2[15])<0) v = -v; s+= v;
+unsigned int inline mmx_accum_absdiff() /* returns the sum of the four word lanes of mm7, then leaves MMX state (femms); expects mm6 == 0 */
+{
+	unsigned long long r = 0;
+	asm("
+		.align 8
+		movq		%%mm7,	%%mm5;
+		movq		%%mm7,	%%mm4;
+		punpcklwd	%%mm6,	%%mm4;
+		punpckhwd	%%mm6,	%%mm5;
+		paddd		%%mm5,	%%mm4;
+		movq		%%mm4,	%%mm5;
+		punpckldq	%%mm6,	%%mm5;
+		punpckhdq	%%mm6,	%%mm4;
+		paddd		%%mm5,	%%mm4;
+		movq		%%mm4,	(%%ebx);
+		femms;
+		"
+		: :  "b" (&r) : "memory"); /* "memory": r is written through its address, so it must be reloaded below */
 
-      if (s >= distlim)
-        break;
+	return r;
+}
 
-      p1+= lx;
-      p2+= lx;
-    }
-  else if (hx && !hy)
-    for (j=0; j<h; j++)
-    {
-      for (i=0; i<16; i++)
-      {
-        v = ((unsigned int)(p1[i]+p1[i+1]+1)>>1) - p2[i];
-        if (v>=0)
-          s+= v;
-        else
-          s-= v;
-      }
-      p1+= lx;
-      p2+= lx;
-    }
-  else if (!hx && hy)
-  {
-    p1a = p1 + lx;
-    for (j=0; j<h; j++)
-    {
-      for (i=0; i<16; i++)
-      {
-        v = ((unsigned int)(p1[i]+p1a[i]+1)>>1) - p2[i];
-        if (v>=0)
-          s+= v;
-        else
-          s-= v;
-      }
-      p1 = p1a;
-      p1a+= lx;
-      p2+= lx;
-    }
-  }
-  else /* if (hx && hy) */
-  {
-    p1a = p1 + lx;
-    for (j=0; j<h; j++)
-    {
-      for (i=0; i<16; i++)
-      {
-        v = ((unsigned int)(p1[i]+p1[i+1]+p1a[i]+p1a[i+1]+2)>>2) - p2[i];
-        if (v>=0)
-          s+= v;
-        else
-          s-= v;
-      }
-      p1 = p1a;
-      p1a+= lx;
-      p2+= lx;
-    }
-  }
+static unsigned long  MMX_AVGDIFF_1[]         = {0x00010001, 0x00010001};
 
-  return s;
+
+void inline mmx_avgdiff(unsigned char *p1, unsigned char *p2, unsigned char *p3) /* adds |avg(p1[k],p2[k]) - p3[k]| for k=0..7 into mm7; the active path expects mm6 == 0 */
+{
+#if 0
+	asm("
+		.align 8
+		movq		(%%ebx),	   %%mm0;	   // Load 8 pixels from a
+		pxor        %%mm4,         %%mm4;      // Zero out temp for unpacking a
+		movq        %%mm0,         %%mm2;      // Make a copy of a for unpacking
+		movq        (%%ecx),       %%mm1;	   // Load 8 pixels from b
+		pxor        %%mm3,         %%mm3;      // Zero out b's upper unpacked destination
+		punpcklbw   %%mm4,         %%mm2;	   // Unpack lower 4 pixels from a for addition
+		movq        %%mm1,         %%mm5;      // Copy b for unpacking
+		punpckhbw   %%mm4,         %%mm0;      // Unpack upper 4 pixels from a for addition
+		punpcklbw   %%mm3,         %%mm5;      // Unpack lower 4 pixels from b for addition
+		paddw       %%mm2,         %%mm5;      // Add lower a and lower b unpacked
+		punpckhbw   %%mm3,         %%mm1;      // Unpack upper 4 pixels from b for addition
+		paddw       %%mm0,         %%mm1;      // Add upper a and upper b unpacked
+ 		movq        (%%edx),       %%mm2;      // Load c for difference
+ 		paddw       MMX_AVGDIFF_1, %%mm5;      // Add 1 to the result of lower a + b
+		pxor        %%mm4,         %%mm4;      // Zero out temp for c unpacking
+ 		movq        %%mm2,         %%mm3;      // Make a copy of c for unpacking
+ 		paddw       MMX_AVGDIFF_1, %%mm1;      // Add 1 to the result of upper a + b
+		punpcklbw   %%mm4,         %%mm3;      // Unpack lower 4 pixels from c for subtraction
+		punpckhbw   %%mm4,         %%mm2;      // Unpack upper 4 pixels from c
+		movq        %%mm3,         %%mm0;      // Make a copy of lower c for absdiff
+		psraw       $1,            %%mm5;      // Divide result of lower a + b by 2
+		movq        %%mm2,         %%mm4;      // Make a copy of upper c for absdiff
+		psraw       $1,            %%mm1;      // Divide result of upper a + b by 2
+		psubusw     %%mm5,         %%mm3;      // Subtract lower pixels one way
+		psubusw     %%mm1,         %%mm2;      // Subtract upper pixels one way
+		psubusw     %%mm0,         %%mm5;      // Subtract lower pixels the other way
+		por         %%mm5,         %%mm3;      // Or the result of the lower pixels
+		psubusw     %%mm4,         %%mm1;      // Subtract upper pixels the other way
+		por         %%mm1,         %%mm2;      // Or the result of the upper pixels
+		paddw       %%mm3,         %%mm7;      // Accumulate lower pixels
+		paddw       %%mm2,         %%mm7;      // Accumulate upper pixels
+		"
+		:
+		: "b" (p1), "c" (p2), "d" (p3) : "memory");
+#else
+	asm("
+		.align 8
+		movq		(%%ebx),	   %%mm0;	   // Load 8 pixels from a
+		movq        (%%ecx),       %%mm1;	   // Load 8 pixels from b
+		pavgusb     %%mm1,         %%mm0;      // avg of a, b in mm0
+ 		movq        (%%edx),       %%mm2;      // Load c for difference
+ 		movq	    %%mm0,         %%mm5;
+		psubusb     %%mm2,         %%mm0;      // Subtract lower pixels one way
+		psubusb	    %%mm5,	   %%mm2;
+		por         %%mm0,         %%mm2;      // Or the result of the lower pixels
+		movq        %%mm2,         %%mm3;      // Copy the 8 differences for the lower half
+		punpcklbw   %%mm6,         %%mm3;      // Zero-extend lower 4 differences (mm6 is zero)
+		punpckhbw   %%mm6,         %%mm2;      // Zero-extend upper 4 differences
+		paddw       %%mm3,         %%mm7;      paddw       %%mm2,         %%mm7;
+		"
+		:
+		: "b" (p1), "c" (p2), "d" (p3) : "memory"); /* "memory": all three rows are read through the pointers */
+#endif
+}
+
+void inline mmx_avg4diff(unsigned char *p1, unsigned char *p2, unsigned char *p3) /* adds |avg(avg(p1[k],p1[k+1]), avg(p2[k],p2[k+1])) - p3[k]| for k=0..7 into mm7; expects mm6 == 0 */
+{
+	asm("
+		.align 8
+		movq		(%%ebx),	   %%mm0;	   // Load 8 pixels from a
+		movq		1(%%ebx),          %%mm1;
+		pavgusb     %%mm1,         %%mm0;      // avg of a, b in mm0
+		movq        (%%ecx),       %%mm2;	   // Load 8 pixels from b
+		movq        1(%%ecx),      %%mm3;
+		pavgusb	    %%mm2,         %%mm3;
+ 		movq        (%%edx),       %%mm2;      // Load c for difference
+		pavgusb     %%mm3,         %%mm0;
+		movq	    %%mm0,	   %%mm5;
+		psubusb     %%mm2,         %%mm0;      // Subtract lower pixels one way
+		psubusb	    %%mm5,	   %%mm2;
+		por         %%mm0,         %%mm2;      // Or the result of the lower pixels
+		movq        %%mm2,         %%mm3;      // Copy the 8 differences for the lower half
+		punpcklbw   %%mm6,         %%mm3;      // Zero-extend lower 4 differences (mm6 is zero)
+		punpckhbw   %%mm6,         %%mm2;      // Zero-extend upper 4 differences
+		paddw       %%mm3,         %%mm7;      paddw       %%mm2,         %%mm7;
+		"
+		:
+		: "b" (p1), "c" (p2), "d" (p3) : "memory"); /* NOTE(review): cascaded pavgusb can differ by one from (a+b+c+d+2)>>2 used by the C path */
+}
+
+static unsigned long  MMX_ACCUM_AND[]         = {0xffffffff, 0x00000000};
+
+unsigned int inline mmx_accum_avgdiff() /* returns the sum of the four word lanes of mm7, then leaves MMX state (femms); overwrites mm5 and mm6 */
+{
+	unsigned long long r = 0;
+	asm("
+		.align 8
+		pxor            %%mm5,  %%mm5;         // Clear temp for unpacking
+		movq            %%mm7,  %%mm6;         // Make a copy for unpacking
+		punpcklwd       %%mm5,  %%mm6;         // Unpack lower 2 pixels for accumulation
+		punpckhwd       %%mm5,  %%mm7;         // Unpack high 2 pixels for accumulation
+ 		paddw           %%mm6,  %%mm7;         // Add 2 doublewords in each register
+ 		movq            %%mm7,  %%mm6;         // Copy the result for a final add
+ 		pand            MMX_ACCUM_AND, %%mm7;  // And the result for accumulation
+ 		psrlq           $32,    %%mm6;         // Shift the copy right for accumulation
+ 		paddd           %%mm6,  %%mm7;         // Add the results
+ 		movq            %%mm7,  (%%ebx);       // Store result
+		femms;
+		"
+		: :  "b" (&r) : "memory");
+	/* "memory" above: r is written through its address, so it must be reloaded here */
+	return (unsigned int)r;
+}
+
+static int dist1(blk1, blk2, lx, hx, hy, h, distlim) /* sum of absolute differences over a 16-wide, h-high block; hx/hy select half-pel averaging of blk1 */
+unsigned char *blk1,*blk2;
+int lx,hx,hy,h;
+int distlim;
+{
+	unsigned char *p1, *p1a, *p2;
+	int v, s;
+	int j;
+
+	s = 0;
+	p1 = blk1;
+	p2 = blk2;
+
+	if(!hx && !hy)
+	{
+#ifdef HAVE_MMX
+		mmx_start_block();
+#endif
+		for(j = 0; j < h; j++)
+		{
+#ifdef HAVE_MMX
+			mmx_absdiff(p1, p2);
+#else
+			if((v = p1[0]  - p2[0]) < 0) v = -v; s += v;
+			if((v = p1[1]  - p2[1]) < 0) v = -v; s += v;
+			if((v = p1[2]  - p2[2]) < 0) v = -v; s += v;
+			if((v = p1[3]  - p2[3]) < 0) v = -v; s += v;
+			if((v = p1[4]  - p2[4]) < 0) v = -v; s += v;
+			if((v = p1[5]  - p2[5]) < 0) v = -v; s += v;
+			if((v = p1[6]  - p2[6]) < 0) v = -v; s += v;
+			if((v = p1[7]  - p2[7]) < 0) v = -v; s += v;
+			if((v = p1[8]  - p2[8]) < 0) v = -v; s += v;
+			if((v = p1[9]  - p2[9]) < 0) v = -v; s += v;
+			if((v = p1[10] - p2[10]) < 0) v = -v; s += v;
+			if((v = p1[11] - p2[11]) < 0) v = -v; s += v;
+			if((v = p1[12] - p2[12]) < 0) v = -v; s += v;
+			if((v = p1[13] - p2[13]) < 0) v = -v; s += v;
+			if((v = p1[14] - p2[14]) < 0) v = -v; s += v;
+			if((v = p1[15] - p2[15]) < 0) v = -v; s += v;
+			if(s >= distlim) break; /* only the C path honours distlim; the MMX path always sums all h rows */
+#endif
+
+			p1 += lx;
+			p2 += lx;
+		}
+#ifdef HAVE_MMX
+		s = mmx_accum_absdiff();
+#endif
+	}
+  	else
+	if(hx && !hy)
+	{
+#ifdef HAVE_MMX
+		mmx_start_block();
+#endif
+    	for(j = 0; j < h; j++)
+    	{
+#ifdef HAVE_MMX
+			mmx_avgdiff(p1, &p1[1], p2);
+			mmx_avgdiff(&p1[8], &p1[9], &p2[8]);
+#else
+			v = ((unsigned int)(p1[0]  + p1[1]  + 1) >> 1) - p2[0];   if(v < 0) s -= v; else s += v;
+			v = ((unsigned int)(p1[1]  + p1[2]  + 1) >> 1) - p2[1];   if(v < 0) s -= v; else s += v;
+			v = ((unsigned int)(p1[2]  + p1[3]  + 1) >> 1) - p2[2];   if(v < 0) s -= v; else s += v;
+			v = ((unsigned int)(p1[3]  + p1[4]  + 1) >> 1) - p2[3];   if(v < 0) s -= v; else s += v;
+			v = ((unsigned int)(p1[4]  + p1[5]  + 1) >> 1) - p2[4];   if(v < 0) s -= v; else s += v;
+			v = ((unsigned int)(p1[5]  + p1[6]  + 1) >> 1) - p2[5];   if(v < 0) s -= v; else s += v;
+			v = ((unsigned int)(p1[6]  + p1[7]  + 1) >> 1) - p2[6];   if(v < 0) s -= v; else s += v;
+			v = ((unsigned int)(p1[7]  + p1[8]  + 1) >> 1) - p2[7];   if(v < 0) s -= v; else s += v;
+			v = ((unsigned int)(p1[8]  + p1[9]  + 1) >> 1) - p2[8];   if(v < 0) s -= v; else s += v;
+			v = ((unsigned int)(p1[9]  + p1[10] + 1) >> 1) - p2[9];   if(v < 0) s -= v; else s += v;
+			v = ((unsigned int)(p1[10] + p1[11] + 1) >> 1) - p2[10];  if(v < 0) s -= v; else s += v;
+			v = ((unsigned int)(p1[11] + p1[12] + 1) >> 1) - p2[11];  if(v < 0) s -= v; else s += v;
+			v = ((unsigned int)(p1[12] + p1[13] + 1) >> 1) - p2[12];  if(v < 0) s -= v; else s += v;
+			v = ((unsigned int)(p1[13] + p1[14] + 1) >> 1) - p2[13];  if(v < 0) s -= v; else s += v;
+			v = ((unsigned int)(p1[14] + p1[15] + 1) >> 1) - p2[14];  if(v < 0) s -= v; else s += v;
+			v = ((unsigned int)(p1[15] + p1[16] + 1) >> 1) - p2[15];  if(v < 0) s -= v; else s += v;
+#endif
+
+    		p1 += lx;
+    		p2 += lx;
+    	}
+#ifdef HAVE_MMX
+		s = mmx_accum_avgdiff();
+#endif
+	}
+	else if(!hx && hy)
+	{
+#ifdef HAVE_MMX
+		mmx_start_block();
+#endif
+    	p1a = p1 + lx;
+    	for(j = 0; j < h; j++)
+    	{
+#ifdef HAVE_MMX
+			mmx_avgdiff(p1, p1a, p2);
+			mmx_avgdiff(&p1[8], &p1a[8], &p2[8]);
+#else
+        	v = ((unsigned int)(p1[0]  + p1a[0]  + 1) >> 1) - p2[0];  if(v < 0) s -= v; else s += v;
+        	v = ((unsigned int)(p1[1]  + p1a[1]  + 1) >> 1) - p2[1];  if(v < 0) s -= v; else s += v;
+        	v = ((unsigned int)(p1[2]  + p1a[2]  + 1) >> 1) - p2[2];  if(v < 0) s -= v; else s += v;
+        	v = ((unsigned int)(p1[3]  + p1a[3]  + 1) >> 1) - p2[3];  if(v < 0) s -= v; else s += v;
+        	v = ((unsigned int)(p1[4]  + p1a[4]  + 1) >> 1) - p2[4];  if(v < 0) s -= v; else s += v;
+        	v = ((unsigned int)(p1[5]  + p1a[5]  + 1) >> 1) - p2[5];  if(v < 0) s -= v; else s += v;
+        	v = ((unsigned int)(p1[6]  + p1a[6]  + 1) >> 1) - p2[6];  if(v < 0) s -= v; else s += v;
+        	v = ((unsigned int)(p1[7]  + p1a[7]  + 1) >> 1) - p2[7];  if(v < 0) s -= v; else s += v;
+        	v = ((unsigned int)(p1[8]  + p1a[8]  + 1) >> 1) - p2[8];  if(v < 0) s -= v; else s += v;
+        	v = ((unsigned int)(p1[9]  + p1a[9]  + 1) >> 1) - p2[9];  if(v < 0) s -= v; else s += v;
+        	v = ((unsigned int)(p1[10] + p1a[10] + 1) >> 1) - p2[10]; if(v < 0) s -= v; else s += v;
+        	v = ((unsigned int)(p1[11] + p1a[11] + 1) >> 1) - p2[11]; if(v < 0) s -= v; else s += v;
+        	v = ((unsigned int)(p1[12] + p1a[12] + 1) >> 1) - p2[12]; if(v < 0) s -= v; else s += v;
+        	v = ((unsigned int)(p1[13] + p1a[13] + 1) >> 1) - p2[13]; if(v < 0) s -= v; else s += v;
+        	v = ((unsigned int)(p1[14] + p1a[14] + 1) >> 1) - p2[14]; if(v < 0) s -= v; else s += v;
+        	v = ((unsigned int)(p1[15] + p1a[15] + 1) >> 1) - p2[15]; if(v < 0) s -= v; else s += v;
+#endif
+
+    		p1 = p1a;
+    		p1a += lx;
+    		p2 += lx;
+    	}
+#ifdef HAVE_MMX
+		s = mmx_accum_avgdiff();
+#endif
+	}
+	else /* if (hx && hy) */
+	{
+#ifdef HAVE_MMX
+		mmx_start_block();
+#endif
+    	p1a = p1 + lx;
+    	for(j = 0; j < h; j++)
+    	{
+#ifdef HAVE_MMX
+		mmx_avg4diff(p1,p1a,p2);
+		mmx_avg4diff(&p1[8],&p1a[8],&p2[8]);
+#else
+        	v = ((unsigned int)(p1[0]  + p1[1]  + p1a[0]  + p1a[1]  + 2) >> 2) - p2[0];  if(v < 0) s -= v; else s += v;
+        	v = ((unsigned int)(p1[1]  + p1[2]  + p1a[1]  + p1a[2]  + 2) >> 2) - p2[1];  if(v < 0) s -= v; else s += v; /* was p1a[3]: wrong neighbour */
+        	v = ((unsigned int)(p1[2]  + p1[3]  + p1a[2]  + p1a[3]  + 2) >> 2) - p2[2];  if(v < 0) s -= v; else s += v;
+        	v = ((unsigned int)(p1[3]  + p1[4]  + p1a[3]  + p1a[4]  + 2) >> 2) - p2[3];  if(v < 0) s -= v; else s += v;
+        	v = ((unsigned int)(p1[4]  + p1[5]  + p1a[4]  + p1a[5]  + 2) >> 2) - p2[4];  if(v < 0) s -= v; else s += v;
+        	v = ((unsigned int)(p1[5]  + p1[6]  + p1a[5]  + p1a[6]  + 2) >> 2) - p2[5];  if(v < 0) s -= v; else s += v;
+        	v = ((unsigned int)(p1[6]  + p1[7]  + p1a[6]  + p1a[7]  + 2) >> 2) - p2[6];  if(v < 0) s -= v; else s += v;
+        	v = ((unsigned int)(p1[7]  + p1[8]  + p1a[7]  + p1a[8]  + 2) >> 2) - p2[7];  if(v < 0) s -= v; else s += v;
+        	v = ((unsigned int)(p1[8]  + p1[9]  + p1a[8]  + p1a[9]  + 2) >> 2) - p2[8];  if(v < 0) s -= v; else s += v;
+        	v = ((unsigned int)(p1[9]  + p1[10] + p1a[9]  + p1a[10] + 2) >> 2) - p2[9];  if(v < 0) s -= v; else s += v;
+        	v = ((unsigned int)(p1[10] + p1[11] + p1a[10] + p1a[11] + 2) >> 2) - p2[10]; if(v < 0) s -= v; else s += v;
+        	v = ((unsigned int)(p1[11] + p1[12] + p1a[11] + p1a[12] + 2) >> 2) - p2[11]; if(v < 0) s -= v; else s += v;
+        	v = ((unsigned int)(p1[12] + p1[13] + p1a[12] + p1a[13] + 2) >> 2) - p2[12]; if(v < 0) s -= v; else s += v;
+        	v = ((unsigned int)(p1[13] + p1[14] + p1a[13] + p1a[14] + 2) >> 2) - p2[13]; if(v < 0) s -= v; else s += v;
+        	v = ((unsigned int)(p1[14] + p1[15] + p1a[14] + p1a[15] + 2) >> 2) - p2[14]; if(v < 0) s -= v; else s += v;
+        	v = ((unsigned int)(p1[15] + p1[16] + p1a[15] + p1a[16] + 2) >> 2) - p2[15]; if(v < 0) s -= v; else s += v;
+#endif
+    		p1 = p1a;
+    		p1a += lx;
+    		p2+= lx;
+    	}
+#ifdef HAVE_MMX
+		s=mmx_accum_avgdiff();
+#endif
+	}
+  	return s;
 }
 
 /*
@@ -1444,71 +1726,121 @@
  * hx,hy:     flags for horizontal and/or vertical interpolation
  * h:         height of block (usually 8 or 16)
  */
-static int dist2(blk1,blk2,lx,hx,hy,h)
+static int dist2(blk1, blk2, lx, hx, hy, h)
 unsigned char *blk1,*blk2;
 int lx,hx,hy,h;
 {
-  unsigned char *p1,*p1a,*p2;
-  int i,j;
-  int s,v;
-
-  s = 0;
-  p1 = blk1;
-  p2 = blk2;
-  if (!hx && !hy)
-    for (j=0; j<h; j++)
-    {
-      for (i=0; i<16; i++)
-      {
-        v = p1[i] - p2[i];
-        s+= v*v;
-      }
-      p1+= lx;
-      p2+= lx;
-    }
-  else if (hx && !hy)
-    for (j=0; j<h; j++)
-    {
-      for (i=0; i<16; i++)
-      {
-        v = ((unsigned int)(p1[i]+p1[i+1]+1)>>1) - p2[i];
-        s+= v*v;
-      }
-      p1+= lx;
-      p2+= lx;
-    }
-  else if (!hx && hy)
-  {
-    p1a = p1 + lx;
-    for (j=0; j<h; j++)
-    {
-      for (i=0; i<16; i++)
-      {
-        v = ((unsigned int)(p1[i]+p1a[i]+1)>>1) - p2[i];
-        s+= v*v;
-      }
-      p1 = p1a;
-      p1a+= lx;
-      p2+= lx;
-    }
-  }
-  else /* if (hx && hy) */
-  {
-    p1a = p1 + lx;
-    for (j=0; j<h; j++)
-    {
-      for (i=0; i<16; i++)
-      {
-        v = ((unsigned int)(p1[i]+p1[i+1]+p1a[i]+p1a[i+1]+2)>>2) - p2[i];
-        s+= v*v;
-      }
-      p1 = p1a;
-      p1a+= lx;
-      p2+= lx;
-    }
-  }
+	register unsigned char *p1,*p1a,*p2;
+	register int v, s;
+	int j;
+
+	s = 0;
+	p1 = blk1;
+	p2 = blk2;
+	if(!hx && !hy)
+	{
+    	for(j = 0; j < h; j++)
+    	{
+        	v = p1[0]  - p2[0];  s += v * v;
+        	v = p1[1]  - p2[1];  s += v * v;
+        	v = p1[2]  - p2[2];  s += v * v;
+        	v = p1[3]  - p2[3];  s += v * v;
+        	v = p1[4]  - p2[4];  s += v * v;
+        	v = p1[5]  - p2[5];  s += v * v;
+        	v = p1[6]  - p2[6];  s += v * v;
+        	v = p1[7]  - p2[7];  s += v * v;
+        	v = p1[8]  - p2[8];  s += v * v;
+        	v = p1[9]  - p2[9];  s += v * v;
+        	v = p1[10] - p2[10]; s += v * v;
+        	v = p1[11] - p2[11]; s += v * v;
+        	v = p1[12] - p2[12]; s += v * v;
+        	v = p1[13] - p2[13]; s += v * v;
+        	v = p1[14] - p2[14]; s += v * v;
+        	v = p1[15] - p2[15]; s += v * v;
+    		p1 += lx;
+    		p2 += lx;
+    	}
+	}
+	else 
+	if(hx && !hy)
+	{
+    	for (j = 0; j < h; j++)
+    	{
+        	v = ((unsigned int)(p1[0] + p1[1]  + 1) >> 1) - p2[0];  s += v * v;
+        	v = ((unsigned int)(p1[1] + p1[2]  + 1) >> 1) - p2[1];  s += v * v;
+        	v = ((unsigned int)(p1[2] + p1[3]  + 1) >> 1) - p2[2];  s += v * v;
+        	v = ((unsigned int)(p1[3] + p1[4]  + 1) >> 1) - p2[3];  s += v * v;
+        	v = ((unsigned int)(p1[4] + p1[5]  + 1) >> 1) - p2[4];  s += v * v;
+        	v = ((unsigned int)(p1[5] + p1[6]  + 1) >> 1) - p2[5];  s += v * v;
+        	v = ((unsigned int)(p1[6] + p1[7]  + 1) >> 1) - p2[6];  s += v * v;
+        	v = ((unsigned int)(p1[7] + p1[8]  + 1) >> 1) - p2[7];  s += v * v;
+        	v = ((unsigned int)(p1[8] + p1[9]  + 1) >> 1) - p2[8];  s += v * v;
+        	v = ((unsigned int)(p1[9] + p1[10] + 1) >> 1) - p2[9];  s += v * v;
+        	v = ((unsigned int)(p1[10]+ p1[11] + 1) >> 1) - p2[10]; s += v * v;
+        	v = ((unsigned int)(p1[11]+ p1[12] + 1) >> 1) - p2[11]; s += v * v;
+        	v = ((unsigned int)(p1[12]+ p1[13] + 1) >> 1) - p2[12]; s += v * v;
+        	v = ((unsigned int)(p1[13]+ p1[14] + 1) >> 1) - p2[13]; s += v * v;
+        	v = ((unsigned int)(p1[14]+ p1[15] + 1) >> 1) - p2[14]; s += v * v;
+        	v = ((unsigned int)(p1[15]+ p1[16] + 1) >> 1) - p2[15]; s += v * v;
+    		p1 += lx;
+    		p2 += lx;
+    	}
+	}
+	else 
+	if(!hx && hy)
+	{
+    	p1a = p1 + lx;
+    	for(j = 0; j < h; j++)
+    	{
+        	v = ((unsigned int)(p1[0] + p1a[0] + 1) >> 1) - p2[0];  s += v * v;
+        	v = ((unsigned int)(p1[1] + p1a[1] + 1) >> 1) - p2[1];  s += v * v;
+        	v = ((unsigned int)(p1[2] + p1a[2] + 1) >> 1) - p2[2];  s += v * v;
+        	v = ((unsigned int)(p1[3] + p1a[3] + 1) >> 1) - p2[3];  s += v * v;
+        	v = ((unsigned int)(p1[4] + p1a[4] + 1) >> 1) - p2[4];  s += v * v;
+        	v = ((unsigned int)(p1[5] + p1a[5] + 1) >> 1) - p2[5];  s += v * v;
+        	v = ((unsigned int)(p1[6] + p1a[6] + 1) >> 1) - p2[6];  s += v * v;
+        	v = ((unsigned int)(p1[7] + p1a[7] + 1) >> 1) - p2[7];  s += v * v;
+        	v = ((unsigned int)(p1[8] + p1a[8] + 1) >> 1) - p2[8];  s += v * v;
+        	v = ((unsigned int)(p1[9] + p1a[9] + 1) >> 1) - p2[9];  s += v * v;
+        	v = ((unsigned int)(p1[10]+ p1a[10]+ 1) >> 1) - p2[10]; s += v * v;
+        	v = ((unsigned int)(p1[11]+ p1a[11]+ 1) >> 1) - p2[11]; s += v * v;
+        	v = ((unsigned int)(p1[12]+ p1a[12]+ 1) >> 1) - p2[12]; s += v * v;
+        	v = ((unsigned int)(p1[13]+ p1a[13]+ 1) >> 1) - p2[13]; s += v * v;
+        	v = ((unsigned int)(p1[14]+ p1a[14]+ 1) >> 1) - p2[14]; s += v * v;
+        	v = ((unsigned int)(p1[15]+ p1a[15]+ 1) >> 1) - p2[15]; s += v * v;
+    		p1 = p1a;
+    		p1a += lx;
+    		p2 += lx;
+    	}
+	}
+	else /* if (hx && hy) */
+	{
+    	p1a = p1 + lx;
+    	for(j = 0; j < h; j++)
+    	{
+        	v = ((unsigned int)(p1[0]  + p1[1]  + p1a[0]  + p1a[1]  + 2) >> 2) - p2[0];  s += v * v;
+        	v = ((unsigned int)(p1[1]  + p1[2]  + p1a[1]  + p1a[2]  + 2) >> 2) - p2[1];  s += v * v;
+        	v = ((unsigned int)(p1[2]  + p1[3]  + p1a[2]  + p1a[3]  + 2) >> 2) - p2[2];  s += v * v;
+        	v = ((unsigned int)(p1[3]  + p1[4]  + p1a[3]  + p1a[4]  + 2) >> 2) - p2[3];  s += v * v;
+        	v = ((unsigned int)(p1[4]  + p1[5]  + p1a[4]  + p1a[5]  + 2) >> 2) - p2[4];  s += v * v;
+        	v = ((unsigned int)(p1[5]  + p1[6]  + p1a[5]  + p1a[6]  + 2) >> 2) - p2[5];  s += v * v;
+        	v = ((unsigned int)(p1[6]  + p1[7]  + p1a[6]  + p1a[7]  + 2) >> 2) - p2[6];  s += v * v;
+        	v = ((unsigned int)(p1[7]  + p1[8]  + p1a[7]  + p1a[8]  + 2) >> 2) - p2[7];  s += v * v;
+        	v = ((unsigned int)(p1[8]  + p1[9]  + p1a[8]  + p1a[9]  + 2) >> 2) - p2[8];  s += v * v;
+        	v = ((unsigned int)(p1[9]  + p1[10] + p1a[9]  + p1a[10] + 2) >> 2) - p2[9];  s += v * v;
+        	v = ((unsigned int)(p1[10] + p1[11] + p1a[10] + p1a[11] + 2) >> 2) - p2[10]; s += v * v;
+        	v = ((unsigned int)(p1[11] + p1[12] + p1a[11] + p1a[12] + 2) >> 2) - p2[11]; s += v * v;
+        	v = ((unsigned int)(p1[12] + p1[13] + p1a[12] + p1a[13] + 2) >> 2) - p2[12]; s += v * v;
+        	v = ((unsigned int)(p1[13] + p1[14] + p1a[13] + p1a[14] + 2) >> 2) - p2[13]; s += v * v;
+        	v = ((unsigned int)(p1[14] + p1[15] + p1a[14] + p1a[15] + 2) >> 2) - p2[14]; s += v * v;
+        	v = ((unsigned int)(p1[15] + p1[16] + p1a[15] + p1a[16] + 2) >> 2) - p2[15]; s += v * v;
+    		p1 = p1a;
+    		p1a += lx;
+    		p2 += lx;
+    	}
+	}
 
-  return s;
+	return s;
 }
 
 /*
@@ -1579,41 +1911,98 @@
 unsigned char *pf,*pb,*p2;
 int lx,hxf,hyf,hxb,hyb,h;
 {
-  unsigned char *pfa,*pfb,*pfc,*pba,*pbb,*pbc;
-  int i,j;
-  int s,v;
-
-  pfa = pf + hxf;
-  pfb = pf + lx*hyf;
-  pfc = pfb + hxf;
-
-  pba = pb + hxb;
-  pbb = pb + lx*hyb;
-  pbc = pbb + hxb;
-
-  s = 0;
-
-  for (j=0; j<h; j++)
-  {
-    for (i=0; i<16; i++)
-    {
-      v = ((((unsigned int)(*pf++ + *pfa++ + *pfb++ + *pfc++ + 2)>>2) +
-            ((unsigned int)(*pb++ + *pba++ + *pbb++ + *pbc++ + 2)>>2) + 1)>>1)
-          - *p2++;
-      s+=v*v;
-    }
-    p2+= lx-16;
-    pf+= lx-16;
-    pfa+= lx-16;
-    pfb+= lx-16;
-    pfc+= lx-16;
-    pb+= lx-16;
-    pba+= lx-16;
-    pbb+= lx-16;
-    pbc+= lx-16;
-  }
-
-  return s;
+	register unsigned char *pfa,*pfb,*pfc,*pba,*pbb,*pbc;
+	int j;	/* row counter; the 16 columns are unrolled, so no column index exists */
+	register int s, v;	/* s: sum of squared differences, v: difference for one pel */
+
+	pfa = pf + hxf;	/* pf offset by hxf pels within the row */
+	pfb = pf + lx*hyf;	/* pf offset by hyf rows */
+	pfc = pfb + hxf;	/* pf offset by both hxf pels and hyf rows */
+
+	pba = pb + hxb;	/* pb offset by hxb pels within the row */
+	pbb = pb + lx*hyb;	/* pb offset by hyb rows */
+	pbc = pbb + hxb;	/* pb offset by both hxb pels and hyb rows */
+
+	s = 0;
+
+	for(j = 0; j < h; j++)	/* per row: v = rounded mean of the two 4-pel averages minus p2 */
+	{
+		v = ((((unsigned int)(pf[0] + pfa[0] + pfb[0] + pfc[0] + 2) >> 2) +
+			((unsigned int)(pb[0] + pba[0] + pbb[0] + pbc[0] + 2) >> 2) + 1) >> 1) -
+			p2[0];
+		s += v * v;
+		v = ((((unsigned int)(pf[1] + pfa[1] + pfb[1] + pfc[1] + 2) >> 2) +
+			((unsigned int)(pb[1] + pba[1] + pbb[1] + pbc[1] + 2) >> 2) + 1) >> 1) -
+			p2[1];
+		s += v * v;
+		v = ((((unsigned int)(pf[2] + pfa[2] + pfb[2] + pfc[2] + 2) >> 2) +
+			((unsigned int)(pb[2] + pba[2] + pbb[2] + pbc[2] + 2) >> 2) + 1) >> 1) -
+			p2[2];
+		s += v * v;
+		v = ((((unsigned int)(pf[3] + pfa[3] + pfb[3] + pfc[3] + 2) >> 2) +
+			((unsigned int)(pb[3] + pba[3] + pbb[3] + pbc[3] + 2) >> 2) + 1) >> 1) -
+			p2[3];
+		s += v * v;
+		v = ((((unsigned int)(pf[4] + pfa[4] + pfb[4] + pfc[4] + 2) >> 2) +
+			((unsigned int)(pb[4] + pba[4] + pbb[4] + pbc[4] + 2) >> 2) + 1) >> 1) -
+			p2[4];
+		s += v * v;
+		v = ((((unsigned int)(pf[5] + pfa[5] + pfb[5] + pfc[5] + 2) >> 2) +
+			((unsigned int)(pb[5] + pba[5] + pbb[5] + pbc[5] + 2) >> 2) + 1) >> 1) -
+			p2[5];
+		s += v * v;
+		v = ((((unsigned int)(pf[6] + pfa[6] + pfb[6] + pfc[6] + 2) >> 2) +
+			((unsigned int)(pb[6] + pba[6] + pbb[6] + pbc[6] + 2) >> 2) + 1) >> 1) -
+			p2[6];
+		s += v * v;
+		v = ((((unsigned int)(pf[7] + pfa[7] + pfb[7] + pfc[7] + 2) >> 2) +
+			((unsigned int)(pb[7] + pba[7] + pbb[7] + pbc[7] + 2) >> 2) + 1) >> 1) -
+			p2[7];
+		s += v * v;
+		v = ((((unsigned int)(pf[8] + pfa[8] + pfb[8] + pfc[8] + 2) >> 2) +
+			((unsigned int)(pb[8] + pba[8] + pbb[8] + pbc[8] + 2) >> 2) + 1) >> 1) -
+			p2[8];
+		s += v * v;
+		v = ((((unsigned int)(pf[9] + pfa[9] + pfb[9] + pfc[9] + 2) >> 2) +
+			((unsigned int)(pb[9] + pba[9] + pbb[9] + pbc[9] + 2) >> 2) + 1) >> 1) -
+			p2[9];
+		s += v * v;
+		v = ((((unsigned int)(pf[10] + pfa[10] + pfb[10] + pfc[10] + 2) >> 2) +
+			((unsigned int)(pb[10] + pba[10] + pbb[10] + pbc[10] + 2) >> 2) + 1) >> 1) -
+			p2[10];
+		s += v * v;
+		v = ((((unsigned int)(pf[11] + pfa[11] + pfb[11] + pfc[11] + 2) >> 2) +
+			((unsigned int)(pb[11] + pba[11] + pbb[11] + pbc[11] + 2) >> 2) + 1) >> 1) -
+			p2[11];
+		s += v * v;
+		v = ((((unsigned int)(pf[12] + pfa[12] + pfb[12] + pfc[12] + 2) >> 2) +
+			((unsigned int)(pb[12] + pba[12] + pbb[12] + pbc[12] + 2) >> 2) + 1) >> 1) -
+			p2[12];
+		s += v * v;
+		v = ((((unsigned int)(pf[13] + pfa[13] + pfb[13] + pfc[13] + 2) >> 2) +
+			((unsigned int)(pb[13] + pba[13] + pbb[13] + pbc[13] + 2) >> 2) + 1) >> 1) -
+			p2[13];
+		s += v * v;
+		v = ((((unsigned int)(pf[14] + pfa[14] + pfb[14] + pfc[14] + 2) >> 2) +
+			((unsigned int)(pb[14] + pba[14] + pbb[14] + pbc[14] + 2) >> 2) + 1) >> 1) -
+			p2[14];
+		s += v * v;
+		v = ((((unsigned int)(pf[15] + pfa[15] + pfb[15] + pfc[15] + 2) >> 2) +
+			((unsigned int)(pb[15] + pba[15] + pbb[15] + pbc[15] + 2) >> 2) + 1) >> 1) -
+			p2[15];
+		s += v * v;
+
+		p2 += lx;	/* all nine pointers step one full row (lx bytes) */
+		pf += lx;
+		pfa += lx;
+		pfb += lx;
+		pfc += lx;
+		pb += lx;
+		pba += lx;
+		pbb += lx;
+		pbc += lx;
+	}
+	return s;
 }
 
 /*
@@ -1621,24 +2010,34 @@
  * p:  address of top left pel of block
  * lx: distance (in bytes) of vertically adjacent pels
  */
-static int variance(p,lx)
+static int variance(p, lx)
 unsigned char *p;
 int lx;
 {
-  int i,j;
-  unsigned int v,s,s2;
+	int j;	/* row counter; the 16 columns are unrolled, so no column index exists */
+	register unsigned int v, s, s2;	/* v: current pel, s: sum of pels, s2: sum of squared pels */
 
-  s = s2 = 0;
+	s = s2 = 0;
 
-  for (j=0; j<16; j++)
-  {
-    for (i=0; i<16; i++)
-    {
-      v = *p++;
-      s+= v;
-      s2+= v*v;
-    }
-    p+= lx-16;
-  }
-  return s2 - (s*s)/256;
+	for (j=0; j<16; j++)	/* 16 rows of 16 pels each */
+	{
+		v = p[0]; s += v; s2 += v * v;
+		v = p[1]; s += v; s2 += v * v;
+		v = p[2]; s += v; s2 += v * v;
+		v = p[3]; s += v; s2 += v * v;
+		v = p[4]; s += v; s2 += v * v;
+		v = p[5]; s += v; s2 += v * v;
+		v = p[6]; s += v; s2 += v * v;
+		v = p[7]; s += v; s2 += v * v;
+		v = p[8]; s += v; s2 += v * v;
+		v = p[9]; s += v; s2 += v * v;
+		v = p[10]; s += v; s2 += v * v;
+		v = p[11]; s += v; s2 += v * v;
+		v = p[12]; s += v; s2 += v * v;
+		v = p[13]; s += v; s2 += v * v;
+		v = p[14]; s += v; s2 += v * v;
+		v = p[15]; s += v; s2 += v * v;
+		p += lx;	/* step to the same column of the next row */
+	}
+	return s2 - (s*s)/256;	/* sum(v^2) - sum(v)^2/256 over the 256 pels; the division truncates */
 }
Binary files mpeg2enc.orig/motion.o and mpeg2enc/motion.o differ
Binary files mpeg2enc.orig/mpeg2enc.o and mpeg2enc/mpeg2enc.o differ
Binary files mpeg2enc.orig/mpeg2encode and mpeg2enc/mpeg2encode differ
Only in mpeg2enc: mpeg2encode.ref
Binary files mpeg2enc.orig/predict.o and mpeg2enc/predict.o differ
Only in mpeg2enc: profrep1
Only in mpeg2enc: profrep2
Only in mpeg2enc: profrep3
Only in mpeg2enc: profrep4
Only in mpeg2enc: profrep5
Only in mpeg2enc: profrep6
Binary files mpeg2enc.orig/putbits.o and mpeg2enc/putbits.o differ
Binary files mpeg2enc.orig/puthdr.o and mpeg2enc/puthdr.o differ
Binary files mpeg2enc.orig/putmpg.o and mpeg2enc/putmpg.o differ
Binary files mpeg2enc.orig/putpic.o and mpeg2enc/putpic.o differ
Binary files mpeg2enc.orig/putseq.o and mpeg2enc/putseq.o differ
Binary files mpeg2enc.orig/putvlc.o and mpeg2enc/putvlc.o differ
Binary files mpeg2enc.orig/quantize.o and mpeg2enc/quantize.o differ
Only in mpeg2enc: r.diff
Only in mpeg2enc: ratectl.c.mmx
Only in mpeg2enc: ratectl.c.orig
Only in mpeg2enc: ratectl.c.rej
Binary files mpeg2enc.orig/ratectl.o and mpeg2enc/ratectl.o differ
Binary files mpeg2enc.orig/readavi.o and mpeg2enc/readavi.o differ
Binary files mpeg2enc.orig/readpic.o and mpeg2enc/readpic.o differ
Binary files mpeg2enc.orig/stats.o and mpeg2enc/stats.o differ
Only in mpeg2enc.orig: test.m1v
Only in mpeg2enc: test.m1v.ref
Only in mpeg2enc.orig: test.mpg
Only in mpeg2enc: testabsdiff
Only in mpeg2enc: testabsdiff.c
Binary files mpeg2enc.orig/transfrm.o and mpeg2enc/transfrm.o differ
Only in mpeg2enc: x
Only in mpeg2enc: xx
Only in mpeg2enc: xxx
Only in mpeg2enc: zwei
