__m128i line[5];
__m128i pp;

for (j=-2; j<=2; j++)
    line[2+j] = _mm_loadu_si128((const __m128i *)&p[i+j*w-2]);

/* -> higher addresses; _mm_slli_si128 shifts bytes "left", i.e. this way
 *         0 1 2 3 4 5 6 7 8 9 A B C D E F
 * line[0] x x x x x x x x x x x x x x x x
 * line[1] x x x x x x x x x x x x x x x x
 * line[2] x x a b c d e f g h i j k l x x
 * line[3] x x x x x x x x x x x x x x x x
 * line[4] x x x x x x x x x x x x x x x x
 */
#if 1
__m128i store = _mm_setzero_si128();
for (k=0; k<6; k++) {
    /* process the pixels at columns 2 and 8 (a and g); each comment
     * below shows the register contents after the following intrinsic */
    pp = line[2];
    /* x 0 a 0 c 0 e 0 g 0 i 0 k 0 x 0 */
    pp = _mm_and_si128(pp, _mm_set1_epi16(0xff));
    /* x 0 a 0 c 0 e 0 g 0 g 0 g 0 x 0 */
    pp = _mm_shufflehi_epi16(pp, 0 << 0 | 0 << 2 | 0 << 4 | 3 << 6);
    /* x 0 a 0 a 0 a 0 g 0 g 0 g 0 x 0 */
    pp = _mm_shufflelo_epi16(pp, 1 << 2 | 1 << 4 | 1 << 6);
    /* a 0 a 0 a 0 g 0 g 0 g 0 x 0 0 0 */
    pp = _mm_srli_si128(pp, 2);
    /* 0 1 2 3 4 5 6 7 8 9 A B C D E F
     * a a a a a a g g g g g g x x 0 0
     * ---------   ---------
     *   valid       valid
     */
    pp = _mm_or_si128(pp, _mm_slli_si128(pp, 1));

    /* 3 lower dwords contain valid values and need to be added horizontally */
    __m128i avg0 = _mm_setzero_si128();
    __m128i avg1 = _mm_setzero_si128();
    /* 5 lower words contain valid values and need to be added horizontally */
    __m128i cnt0 = _mm_setzero_si128();
    __m128i cnt1 = _mm_setzero_si128();

    for (j=0; j<5; j++) {
        /* absolute difference to the centre pixel */
        __m128i ad = _mm_or_si128(
            _mm_subs_epu8(line[j], pp),
            _mm_subs_epu8(pp, line[j]));
        /* weight: max(lvl - |diff|, 0) */
        __m128i q = _mm_subs_epu8(lvl, ad);
        /* otherwise the madd produces nonsense for the third dword
         * (GNU C vector literal) */
        q = _mm_and_si128(q, (__m128i)(__v16qi){
            0xff, 0xff, 0xff, 0xff, 0xff, 0,
            0xff, 0xff, 0xff, 0xff, 0xff, 0,
            0, 0, 0, 0 });
        __m128i q0 = _mm_unpacklo_epi8(q, _mm_setzero_si128());                    /* weights for a */
        __m128i q1 = _mm_unpacklo_epi8(_mm_srli_si128(q, 6), _mm_setzero_si128()); /* weights for g */
        /* unpacklo also pulled the first two g weights (bytes 6 and 7) into
         * q0; zero them, or the horizontal sums below mix g terms into a's
         * average and count */
        q0 = _mm_and_si128(q0, _mm_set_epi16(0, 0, 0, -1, -1, -1, -1, -1));
        cnt0 = _mm_add_epi16(cnt0, q0);
        cnt1 = _mm_add_epi16(cnt1, q1);
        avg0 = _mm_add_epi32(avg0, _mm_madd_epi16(q0,
            _mm_unpacklo_epi8(line[j], _mm_setzero_si128())));
        avg1 = _mm_add_epi32(avg1, _mm_madd_epi16(q1,
            _mm_unpacklo_epi8(_mm_srli_si128(line[j], 6), _mm_setzero_si128())));
    }

    /* double the weighted sums (for rounding later), then reduce horizontally */
    avg0 = _mm_slli_epi32(avg0, 1);
    avg0 = _mm_add_epi32(avg0, _mm_srli_si128(avg0, 8));
    avg0 = _mm_add_epi32(avg0, _mm_srli_si128(avg0, 4));
    avg1 = _mm_slli_epi32(avg1, 1);
    avg1 = _mm_add_epi32(avg1, _mm_srli_si128(avg1, 8));
    avg1 = _mm_add_epi32(avg1, _mm_srli_si128(avg1, 4));
    cnt0 = _mm_add_epi16(cnt0, _mm_srli_si128(cnt0, 8));
    cnt0 = _mm_add_epi16(cnt0, _mm_srli_si128(cnt0, 4));
    cnt0 = _mm_add_epi16(cnt0, _mm_srli_si128(cnt0, 2));
    cnt1 = _mm_add_epi16(cnt1, _mm_srli_si128(cnt1, 8));
    cnt1 = _mm_add_epi16(cnt1, _mm_srli_si128(cnt1, 4));
    cnt1 = _mm_add_epi16(cnt1, _mm_srli_si128(cnt1, 2));

    /* read out a's sums before overwriting avg/cnt with g's */
    avg = _mm_cvtsi128_si32(avg0);
    cnt = _mm_cvtsi128_si32(cnt0) & 0xffff;
    unsigned char dd0 = (avg / cnt + 1) / 2;   /* d[i+k] */
    avg = _mm_cvtsi128_si32(avg1);
    cnt = _mm_cvtsi128_si32(cnt1) & 0xffff;
    unsigned char dd1 = (avg / cnt + 1) / 2;   /* d[i+k+6] */

    /* widen to 64 bits before shifting; k*8 and (k+2)*8 can exceed 31 */
    __m128i r = _mm_set_epi64x((long long)dd1 << (k * 8),
                               (long long)dd0 << ((k + 2) * 8));
    store = _mm_or_si128(store, r);
    for (j=0; j<5; j++)
        line[j] = _mm_srli_si128(line[j], 1);
}
/* only bytes 0-11 of the shifted result are computed; the store also writes
 * zeros to d[i+12..i+15], presumably overwritten by the next 12-pixel step */
_mm_storeu_si128((__m128i *)&d[i], _mm_srli_si128(store, 2));
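
To see what the intrinsics above compute, here is a plain-C reference for the same 12-pixel step. It is a sketch under assumptions: p and d are 8-bit planes with stride w, lvl broadcasts a scalar threshold level, and the function name filter_row_ref and the level parameter are illustrative, not from the original.

#include <stdlib.h>  /* abs */

/* Reference: the 12 output pixels d[i..i+11], each a weighted 5x5 average
 * where the weight is max(level - |neighbour - centre|, 0). */
static void filter_row_ref(const unsigned char *p, unsigned char *d,
                           int w, int i, int level)
{
    int k, x, y;
    for (k = 0; k < 12; k++) {
        int avg = 0, cnt = 0;
        for (y = -2; y <= 2; y++) {
            for (x = -2; x <= 2; x++) {
                int s = p[i + k + y*w + x];
                int q = level - abs(s - p[i + k]); /* graded weight */
                if (q < 0)
                    q = 0;
                avg += q * s;
                cnt += q;
            }
        }
        /* same rounding as the SIMD path: (2*sum/cnt + 1)/2 */
        d[i + k] = (2 * avg / cnt + 1) / 2;
    }
}

Each output is a weighted average over its 5x5 neighbourhood in which the weight falls off linearly with the absolute difference to the centre pixel and reaches zero at level, so strong edges are left mostly untouched; the centre pixel always contributes weight level, which keeps cnt nonzero for level > 0.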