This patch splits the inline assembly in transpose4x4() into two separate asm
statements because otherwise some versions of gcc (at least Debian's 4.3.2) are
unable to allocate registers for all of the operands when compiling this code
with -fPIC.

Signed-off-by: Mathias Krause <mathias.krause@secunet.com>

diff -Nrup a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
--- a/libavcodec/x86/dsputil_mmx.c	2010-04-16 22:04:30.000000000 +0200
+++ b/libavcodec/x86/dsputil_mmx.c	2010-09-03 21:14:43.000000000 +0200
@@ -725,15 +725,23 @@ static void h263_v_loop_filter_mmx(uint8
 
 static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
     __asm__ volatile( //FIXME could save 1 instruction if done as 8x4 ...
-        "movd  %4, %%mm0                \n\t"
-        "movd  %5, %%mm1                \n\t"
-        "movd  %6, %%mm2                \n\t"
-        "movd  %7, %%mm3                \n\t"
+        "movd  %0, %%mm0                \n\t"
+        "movd  %1, %%mm1                \n\t"
+        "movd  %2, %%mm2                \n\t"
+        "movd  %3, %%mm3                \n\t"
         "punpcklbw %%mm1, %%mm0         \n\t"
         "punpcklbw %%mm3, %%mm2         \n\t"
         "movq %%mm0, %%mm1              \n\t"
         "punpcklwd %%mm2, %%mm0         \n\t"
         "punpckhwd %%mm2, %%mm1         \n\t"
+		: /* nothing */
+        :  "m" (*(uint32_t*)(src + 0*src_stride)),
+           "m" (*(uint32_t*)(src + 1*src_stride)),
+           "m" (*(uint32_t*)(src + 2*src_stride)),
+           "m" (*(uint32_t*)(src + 3*src_stride))
+    );
+
+    __asm__ volatile( //FIXME could save 1 instruction if done as 8x4 ...
         "movd  %%mm0, %0                \n\t"
         "punpckhdq %%mm0, %%mm0         \n\t"
         "movd  %%mm0, %1                \n\t"
@@ -745,10 +753,6 @@ static inline void transpose4x4(uint8_t 
           "=m" (*(uint32_t*)(dst + 1*dst_stride)),
           "=m" (*(uint32_t*)(dst + 2*dst_stride)),
           "=m" (*(uint32_t*)(dst + 3*dst_stride))
-        :  "m" (*(uint32_t*)(src + 0*src_stride)),
-           "m" (*(uint32_t*)(src + 1*src_stride)),
-           "m" (*(uint32_t*)(src + 2*src_stride)),
-           "m" (*(uint32_t*)(src + 3*src_stride))
     );
 }