| | 166 | |
|---|
| | 167 | #if defined (__SSE2__) && !defined (__sun__) |
|---|
| | 168 | |
|---|
/*
 * Returns a vector whose four lanes all hold 1.0f, i.e. the same value as
 * _mm_set1_ps(1.f), built purely with register operations.
 *
 * Comparing a register with itself sets every bit of every lane. Shifting
 * each 32-bit lane right by 25 keeps seven set bits (0x7F); shifting that
 * left by 23 yields 0x3F800000, the IEEE-754 single-precision encoding of 1.0f.
 */
static inline __m128 gen_one(void)
{
	/* The seed is explicitly zeroed so the self-comparison never reads an
	 * indeterminate value; its actual contents do not affect the result. */
	const __m128i seed = _mm_setzero_si128();
	const __m128i all_bits = _mm_cmpeq_epi32(seed, seed);

	/* Bit-level reinterpretation only; no numeric conversion takes place. */
	return _mm_castsi128_ps(_mm_slli_epi32(_mm_srli_epi32(all_bits, 25), 23));
}
|---|
| | 176 | |
|---|
/*
 * Clamps every lane of s to the range [min, max].
 *
 * Lanes below the matching lane of min are raised to it first; the result is
 * then capped at the matching lane of max. The operand order of both
 * intrinsics is deliberate and must be kept, since the SSE min/max
 * instructions are not symmetric for NaN inputs.
 */
static inline __m128 clip(__m128 s, __m128 min, __m128 max)
{
	const __m128 raised = _mm_max_ps(s, min);

	return _mm_min_ps(max, raised);
}
|---|
| | 181 | |
|---|
| | 182 | static inline __m128i float_24_sse(__m128 s) |
|---|
| | 183 | { |
|---|
| | 184 | const __m128 upper_bound = gen_one(); /* NORMALIZED_FLOAT_MAX */ |
|---|
| | 185 | const __m128 lower_bound = _mm_sub_ps(_mm_setzero_ps(), upper_bound); |
|---|
| | 186 | |
|---|
| | 187 | __m128 clipped = clip(s, lower_bound, upper_bound); |
|---|
| | 188 | __m128 scaled = _mm_mul_ps(clipped, _mm_set1_ps(SAMPLE_24BIT_SCALING)); |
|---|
| | 189 | return _mm_cvtps_epi32(scaled); |
|---|
| | 190 | } |
|---|
| | 191 | #endif |
|---|
| 267 | | dst += dst_skip; |
|---|
| 268 | | _mm_store_ss((float*)dst, (__m128)shuffled1); |
|---|
| 269 | | dst += dst_skip; |
|---|
| 270 | | _mm_store_ss((float*)dst, (__m128)shuffled2); |
|---|
| 271 | | dst += dst_skip; |
|---|
| 272 | | _mm_store_ss((float*)dst, (__m128)shuffled3); |
|---|
| 273 | | dst += dst_skip; |
|---|
| | 293 | |
|---|
| | 294 | _mm_store_ss((float*)(dst+dst_skip), (__m128)shuffled1); |
|---|
| | 295 | _mm_store_ss((float*)(dst+2*dst_skip), (__m128)shuffled2); |
|---|
| | 296 | _mm_store_ss((float*)(dst+3*dst_skip), (__m128)shuffled3); |
|---|
| | 297 | dst += 4*dst_skip; |
|---|
| 392 | | int32_t z; |
|---|
| 393 | | |
|---|
| | 416 | #if defined (__SSE2__) && !defined (__sun__) |
|---|
| | 417 | _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); |
|---|
| | 418 | while (nsamples >= 4) { |
|---|
| | 419 | int i; |
|---|
| | 420 | int32_t z[4]; |
|---|
| | 421 | __m128 samples = _mm_loadu_ps(src); |
|---|
| | 422 | __m128i converted = float_24_sse(samples); |
|---|
| | 423 | |
|---|
| | 424 | __m128i shuffled1 = _mm_shuffle_epi32(converted, _MM_SHUFFLE(0, 3, 2, 1)); |
|---|
| | 425 | __m128i shuffled2 = _mm_shuffle_epi32(converted, _MM_SHUFFLE(1, 0, 3, 2)); |
|---|
| | 426 | __m128i shuffled3 = _mm_shuffle_epi32(converted, _MM_SHUFFLE(2, 1, 0, 3)); |
|---|
| | 427 | |
|---|
| | 428 | _mm_store_ss((float*)z, (__m128)converted); |
|---|
| | 429 | _mm_store_ss((float*)z+1, (__m128)shuffled1); |
|---|
| | 430 | _mm_store_ss((float*)z+2, (__m128)shuffled2); |
|---|
| | 431 | _mm_store_ss((float*)z+3, (__m128)shuffled3); |
|---|
| | 432 | |
|---|
| | 433 | for (i = 0; i != 4; ++i) { |
|---|
| | 434 | #if __BYTE_ORDER == __LITTLE_ENDIAN |
|---|
| | 435 | memcpy (dst, z+i, 3); |
|---|
| | 436 | #elif __BYTE_ORDER == __BIG_ENDIAN |
|---|
| | 437 | memcpy (dst, (float*)((char *)&z + 1)+i, 3); |
|---|
| | 438 | #endif |
|---|
| | 439 | dst += dst_skip; |
|---|
| | 440 | } |
|---|
| | 441 | nsamples -= 4; |
|---|
| | 442 | src += 4; |
|---|
| | 443 | } |
|---|
| | 444 | #endif |
|---|
| | 445 | |
|---|
| | 446 | int32_t z; |
|---|
| | 447 | |
|---|