 host/lib/convert/sse2_fc32_to_sc16.cpp | 161
 host/lib/convert/sse2_fc32_to_sc8.cpp  | 127
 host/lib/convert/sse2_fc64_to_sc16.cpp | 138
 host/lib/convert/sse2_fc64_to_sc8.cpp  | 153
 host/lib/convert/sse2_sc16_to_fc32.cpp | 162
 host/lib/convert/sse2_sc16_to_fc64.cpp | 140
 host/lib/convert/sse2_sc16_to_sc16.cpp | 240
 host/lib/convert/sse2_sc8_to_fc32.cpp  | 132
 host/lib/convert/sse2_sc8_to_fc64.cpp  | 168
 9 files changed, 742 insertions(+), 679 deletions(-)
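Every file in this commit follows the same converter pattern: scale each complex float (or double) sample, round-convert with SSE2, pack to narrower integers with signed saturation, apply the endian swizzle the item32 wire format needs, and dispatch on input alignment so the bulk of the samples take the aligned fast path. The sketch below is a minimal, self-contained illustration of that pattern, not the UHD API — fc32_to_sc16, its parameters, and the host-order interleaved output are assumptions made for the example, and the wire-format swizzles (the _MM_SHUFFLE pair-swaps and the shift-or byteswap) are only noted in comments.

#include <emmintrin.h> // SSE2 intrinsics
#include <complex>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

// Minimal sketch: convert nsamps complex<float> samples to interleaved
// int16_t I/Q pairs, scaling by `scale`. Four complex samples (eight
// floats) are handled per SSE2 iteration; a scalar loop takes the tail.
static void fc32_to_sc16(
    const std::complex<float>* input, int16_t* output, size_t nsamps, float scale)
{
    const __m128 scalar = _mm_set_ps1(scale);
    size_t i = 0;
    // Aligned fast path only; the real converters also dispatch to an
    // unaligned-load variant (the _al_ macro token) and handle an 8-byte
    // aligned case by converting one sample to reach 16-byte alignment.
    if ((reinterpret_cast<uintptr_t>(input) & 0xf) == 0) {
        for (; i + 3 < nsamps; i += 4) {
            __m128 lo = _mm_load_ps(reinterpret_cast<const float*>(input + i + 0));
            __m128 hi = _mm_load_ps(reinterpret_cast<const float*>(input + i + 2));
            // scale, then round-convert float -> int32
            __m128i ilo = _mm_cvtps_epi32(_mm_mul_ps(lo, scalar));
            __m128i ihi = _mm_cvtps_epi32(_mm_mul_ps(hi, scalar));
            // pack int32 -> int16 with signed saturation (clips, never wraps)
            __m128i packed = _mm_packs_epi32(ilo, ihi);
            // A big-endian wire format would byteswap each 16-bit lane here:
            // packed = _mm_or_si128(_mm_srli_epi16(packed, 8),
            //                       _mm_slli_epi16(packed, 8));
            _mm_storeu_si128(reinterpret_cast<__m128i*>(output + 2 * i), packed);
        }
    }
    // scalar tail (truncates and omits saturation, for brevity; the UHD
    // converters instead hand the remainder to xx_to_item32_sc16)
    for (; i < nsamps; i++) {
        output[2 * i + 0] = static_cast<int16_t>(input[i].real() * scale);
        output[2 * i + 1] = static_cast<int16_t>(input[i].imag() * scale);
    }
}

int main()
{
    std::vector<std::complex<float>> in = {
        {0.5f, -0.5f}, {1.0f, -1.0f}, {0.25f, 0.0f}, {-0.75f, 0.125f}, {0.1f, 0.2f}};
    std::vector<int16_t> out(2 * in.size());
    fc32_to_sc16(in.data(), out.data(), in.size(), 32767.0f);
    for (size_t n = 0; n < in.size(); n++)
        std::printf("(%d, %d)\n", out[2 * n], out[2 * n + 1]);
    return 0;
}

Note that _mm_packs_epi32 (and _mm_packs_epi16 in the sc8 converters) saturates rather than wraps, which is why the converters can scale by the full-scale value without guarding each lane against overflow.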
diff --git a/host/lib/convert/sse2_fc32_to_sc16.cpp b/host/lib/convert/sse2_fc32_to_sc16.cpp index f562074c6..2d1f853b9 100644 --- a/host/lib/convert/sse2_fc32_to_sc16.cpp +++ b/host/lib/convert/sse2_fc32_to_sc16.cpp @@ -1,6 +1,7 @@  //  // Copyright 2011-2012 Ettus Research LLC  // Copyright 2018 Ettus Research, a National Instruments Company +// Copyright 2019 Ettus Research, a National Instruments Brand  //  // SPDX-License-Identifier: GPL-3.0-or-later  // @@ -11,101 +12,111 @@  using namespace uhd::convert; -DECLARE_CONVERTER(fc32, 1, sc16_item32_le, 1, PRIORITY_SIMD){ -    const fc32_t *input = reinterpret_cast<const fc32_t *>(inputs[0]); -    item32_t *output = reinterpret_cast<item32_t *>(outputs[0]); +DECLARE_CONVERTER(fc32, 1, sc16_item32_le, 1, PRIORITY_SIMD) +{ +    const fc32_t* input = reinterpret_cast<const fc32_t*>(inputs[0]); +    item32_t* output    = reinterpret_cast<item32_t*>(outputs[0]);      const __m128 scalar = _mm_set_ps1(float(scale_factor)); -    // this macro converts values faster by using SSE intrinsics to convert 4 values at a time -    #define convert_fc32_1_to_item32_1_nswap_guts(_al_)                 \ -    for (; i+3 < nsamps; i+=4){                                         \ -        /* load from input */                                           \ -        __m128 tmplo = _mm_load ## _al_ ## ps(reinterpret_cast<const float *>(input+i+0)); \ -        __m128 tmphi = _mm_load ## _al_ ## ps(reinterpret_cast<const float *>(input+i+2)); \ -                                                                        \ -        /* convert and scale */                                         \ -        __m128i tmpilo = _mm_cvtps_epi32(_mm_mul_ps(tmplo, scalar));    \ -        __m128i tmpihi = _mm_cvtps_epi32(_mm_mul_ps(tmphi, scalar));    \ -                                                                        \ -        /* pack + swap 16-bit pairs */                                  \ -        __m128i tmpi = _mm_packs_epi32(tmpilo, tmpihi);                 \ -        tmpi = _mm_shufflelo_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1));      \ -        tmpi = _mm_shufflehi_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1));      \ -                                                                        \ -        /* store to output */                                           \ -        _mm_storeu_si128(reinterpret_cast<__m128i *>(output+i), tmpi);  \ -    }                                                                   \ +// this macro converts values faster by using SSE intrinsics to convert 4 values at a time +#define convert_fc32_1_to_item32_1_nswap_guts(_al_)                            \ +    for (; i + 3 < nsamps; i += 4) {                                           \ +        /* load from input */                                                  \ +        __m128 tmplo =                                                         \ +            _mm_load##_al_##ps(reinterpret_cast<const float*>(input + i + 0)); \ +        __m128 tmphi =                                                         \ +            _mm_load##_al_##ps(reinterpret_cast<const float*>(input + i + 2)); \ +                                                                               \ +        /* convert and scale */                                                \ +        __m128i tmpilo = _mm_cvtps_epi32(_mm_mul_ps(tmplo, scalar));           \ +        __m128i tmpihi = _mm_cvtps_epi32(_mm_mul_ps(tmphi, scalar));           \ +                                                                               \ +        /* 
pack + swap 16-bit pairs */                                         \ +        __m128i tmpi = _mm_packs_epi32(tmpilo, tmpihi);                        \ +        tmpi         = _mm_shufflelo_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1));     \ +        tmpi         = _mm_shufflehi_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1));     \ +                                                                               \ +        /* store to output */                                                  \ +        _mm_storeu_si128(reinterpret_cast<__m128i*>(output + i), tmpi);        \ +    }      size_t i = 0;      // need to dispatch according to alignment for fastest conversion -    switch (size_t(input) & 0xf){ -    case 0x0: -        // the data is 16-byte aligned, so do the fast processing of the bulk of the samples -        convert_fc32_1_to_item32_1_nswap_guts(_) -        break; -    case 0x8: -        // the first sample is 8-byte aligned - process it to align the remainder of the samples to 16-bytes -        xx_to_item32_sc16<uhd::htowx>(input, output, 1, scale_factor); -        i++; -        // do faster processing of the bulk of the samples now that we are 16-byte aligned -        convert_fc32_1_to_item32_1_nswap_guts(_) -        break; -    default: -        // we are not 8 or 16-byte aligned, so do fast processing with the unaligned load -        convert_fc32_1_to_item32_1_nswap_guts(u_) +    switch (size_t(input) & 0xf) { +        case 0x0: +            // the data is 16-byte aligned, so do the fast processing of the bulk of the +            // samples +            convert_fc32_1_to_item32_1_nswap_guts(_) break; +        case 0x8: +            // the first sample is 8-byte aligned - process it to align the remainder of +            // the samples to 16-bytes +            xx_to_item32_sc16<uhd::htowx>(input, output, 1, scale_factor); +            i++; +            // do faster processing of the bulk of the samples now that we are 16-byte +            // aligned +            convert_fc32_1_to_item32_1_nswap_guts(_) break; +        default: +            // we are not 8 or 16-byte aligned, so do fast processing with the unaligned +            // load +            convert_fc32_1_to_item32_1_nswap_guts(u_)      }      // convert any remaining samples -    xx_to_item32_sc16<uhd::htowx>(input+i, output+i, nsamps-i, scale_factor); +    xx_to_item32_sc16<uhd::htowx>(input + i, output + i, nsamps - i, scale_factor);  } -DECLARE_CONVERTER(fc32, 1, sc16_item32_be, 1, PRIORITY_SIMD){ -    const fc32_t *input = reinterpret_cast<const fc32_t *>(inputs[0]); -    item32_t *output = reinterpret_cast<item32_t *>(outputs[0]); +DECLARE_CONVERTER(fc32, 1, sc16_item32_be, 1, PRIORITY_SIMD) +{ +    const fc32_t* input = reinterpret_cast<const fc32_t*>(inputs[0]); +    item32_t* output    = reinterpret_cast<item32_t*>(outputs[0]);      const __m128 scalar = _mm_set_ps1(float(scale_factor)); -    // this macro converts values faster by using SSE intrinsics to convert 4 values at a time -    #define convert_fc32_1_to_item32_1_bswap_guts(_al_)                 \ -    for (; i+3 < nsamps; i+=4){                                         \ -        /* load from input */                                           \ -        __m128 tmplo = _mm_load ## _al_ ## ps(reinterpret_cast<const float *>(input+i+0)); \ -        __m128 tmphi = _mm_load ## _al_ ## ps(reinterpret_cast<const float *>(input+i+2)); \ -                                                                        \ -        /* convert and scale */                                     
    \ -        __m128i tmpilo = _mm_cvtps_epi32(_mm_mul_ps(tmplo, scalar));    \ -        __m128i tmpihi = _mm_cvtps_epi32(_mm_mul_ps(tmphi, scalar));    \ -                                                                        \ -        /* pack + byteswap -> byteswap 16 bit words */                  \ -        __m128i tmpi = _mm_packs_epi32(tmpilo, tmpihi);                 \ -        tmpi = _mm_or_si128(_mm_srli_epi16(tmpi, 8), _mm_slli_epi16(tmpi, 8)); \ -                                                                        \ -        /* store to output */                                           \ -        _mm_storeu_si128(reinterpret_cast<__m128i *>(output+i), tmpi);  \ -    }                                                                   \ +// this macro converts values faster by using SSE intrinsics to convert 4 values at a time +#define convert_fc32_1_to_item32_1_bswap_guts(_al_)                                    \ +    for (; i + 3 < nsamps; i += 4) {                                                   \ +        /* load from input */                                                          \ +        __m128 tmplo =                                                                 \ +            _mm_load##_al_##ps(reinterpret_cast<const float*>(input + i + 0));         \ +        __m128 tmphi =                                                                 \ +            _mm_load##_al_##ps(reinterpret_cast<const float*>(input + i + 2));         \ +                                                                                       \ +        /* convert and scale */                                                        \ +        __m128i tmpilo = _mm_cvtps_epi32(_mm_mul_ps(tmplo, scalar));                   \ +        __m128i tmpihi = _mm_cvtps_epi32(_mm_mul_ps(tmphi, scalar));                   \ +                                                                                       \ +        /* pack + byteswap -> byteswap 16 bit words */                                 \ +        __m128i tmpi = _mm_packs_epi32(tmpilo, tmpihi);                                \ +        tmpi         = _mm_or_si128(_mm_srli_epi16(tmpi, 8), _mm_slli_epi16(tmpi, 8)); \ +                                                                                       \ +        /* store to output */                                                          \ +        _mm_storeu_si128(reinterpret_cast<__m128i*>(output + i), tmpi);                \ +    }      size_t i = 0;      // need to dispatch according to alignment for fastest conversion -    switch (size_t(input) & 0xf){ -    case 0x0: -        // the data is 16-byte aligned, so do the fast processing of the bulk of the samples -        convert_fc32_1_to_item32_1_bswap_guts(_) -        break; -    case 0x8: -        // the first value is 8-byte aligned - process it and prepare the bulk of the data for fast conversion -        xx_to_item32_sc16<uhd::htonx>(input, output, 1, scale_factor); -        i++; -        // do faster processing of the remaining samples now that we are 16-byte aligned -        convert_fc32_1_to_item32_1_bswap_guts(_) -        break; -    default: -        // we are not 8 or 16-byte aligned, so do fast processing with the unaligned load -        convert_fc32_1_to_item32_1_bswap_guts(u_) +    switch (size_t(input) & 0xf) { +        case 0x0: +            // the data is 16-byte aligned, so do the fast processing of the bulk of the +            // samples +            convert_fc32_1_to_item32_1_bswap_guts(_) break; +        case 
0x8: +            // the first value is 8-byte aligned - process it and prepare the bulk of the +            // data for fast conversion +            xx_to_item32_sc16<uhd::htonx>(input, output, 1, scale_factor); +            i++; +            // do faster processing of the remaining samples now that we are 16-byte +            // aligned +            convert_fc32_1_to_item32_1_bswap_guts(_) break; +        default: +            // we are not 8 or 16-byte aligned, so do fast processing with the unaligned +            // load +            convert_fc32_1_to_item32_1_bswap_guts(u_)      }      // convert any remaining samples -    xx_to_item32_sc16<uhd::htonx>(input+i, output+i, nsamps-i, scale_factor); +    xx_to_item32_sc16<uhd::htonx>(input + i, output + i, nsamps - i, scale_factor);  } diff --git a/host/lib/convert/sse2_fc32_to_sc8.cpp b/host/lib/convert/sse2_fc32_to_sc8.cpp index b3f96ea39..66faa82cc 100644 --- a/host/lib/convert/sse2_fc32_to_sc8.cpp +++ b/host/lib/convert/sse2_fc32_to_sc8.cpp @@ -12,94 +12,95 @@  using namespace uhd::convert;  template <const int shuf> -UHD_INLINE __m128i pack_sc32_4x( -    const __m128 &in0, const __m128 &in1, -    const __m128 &in2, const __m128 &in3, -    const __m128 &scalar -){ -    __m128i tmpi0 = _mm_cvtps_epi32(_mm_mul_ps(in0, scalar)); -    tmpi0 = _mm_shuffle_epi32(tmpi0, shuf); -    __m128i tmpi1 = _mm_cvtps_epi32(_mm_mul_ps(in1, scalar)); -    tmpi1 = _mm_shuffle_epi32(tmpi1, shuf); +UHD_INLINE __m128i pack_sc32_4x(const __m128& in0, +    const __m128& in1, +    const __m128& in2, +    const __m128& in3, +    const __m128& scalar) +{ +    __m128i tmpi0    = _mm_cvtps_epi32(_mm_mul_ps(in0, scalar)); +    tmpi0            = _mm_shuffle_epi32(tmpi0, shuf); +    __m128i tmpi1    = _mm_cvtps_epi32(_mm_mul_ps(in1, scalar)); +    tmpi1            = _mm_shuffle_epi32(tmpi1, shuf);      const __m128i lo = _mm_packs_epi32(tmpi0, tmpi1); -    __m128i tmpi2 = _mm_cvtps_epi32(_mm_mul_ps(in2, scalar)); -    tmpi2 = _mm_shuffle_epi32(tmpi2, shuf); -    __m128i tmpi3 = _mm_cvtps_epi32(_mm_mul_ps(in3, scalar)); -    tmpi3 = _mm_shuffle_epi32(tmpi3, shuf); +    __m128i tmpi2    = _mm_cvtps_epi32(_mm_mul_ps(in2, scalar)); +    tmpi2            = _mm_shuffle_epi32(tmpi2, shuf); +    __m128i tmpi3    = _mm_cvtps_epi32(_mm_mul_ps(in3, scalar)); +    tmpi3            = _mm_shuffle_epi32(tmpi3, shuf);      const __m128i hi = _mm_packs_epi32(tmpi2, tmpi3);      return _mm_packs_epi16(lo, hi);  } -DECLARE_CONVERTER(fc32, 1, sc8_item32_be, 1, PRIORITY_SIMD){ -    const fc32_t *input = reinterpret_cast<const fc32_t *>(inputs[0]); -    item32_t *output = reinterpret_cast<item32_t *>(outputs[0]); +DECLARE_CONVERTER(fc32, 1, sc8_item32_be, 1, PRIORITY_SIMD) +{ +    const fc32_t* input = reinterpret_cast<const fc32_t*>(inputs[0]); +    item32_t* output    = reinterpret_cast<item32_t*>(outputs[0]);      const __m128 scalar = _mm_set_ps1(float(scale_factor)); -    const int shuf = _MM_SHUFFLE(3, 2, 1, 0); - -    #define convert_fc32_1_to_sc8_item32_1_bswap_guts(_al_)             \ -    for (size_t j = 0; i+7 < nsamps; i+=8, j+=4){                       \ -        /* load from input */                                           \ -        __m128 tmp0 = _mm_load ## _al_ ## ps(reinterpret_cast<const float *>(input+i+0)); \ -        __m128 tmp1 = _mm_load ## _al_ ## ps(reinterpret_cast<const float *>(input+i+2)); \ -        __m128 tmp2 = _mm_load ## _al_ ## ps(reinterpret_cast<const float *>(input+i+4)); \ -        __m128 tmp3 = _mm_load ## _al_ ## ps(reinterpret_cast<const float 
*>(input+i+6)); \ -                                                                        \ -        /* convert */                                                   \ -        const __m128i tmpi = pack_sc32_4x<shuf>(tmp0, tmp1, tmp2, tmp3, scalar); \ -                                                                        \ -        /* store to output */                                           \ -        _mm_storeu_si128(reinterpret_cast<__m128i *>(output+j), tmpi);  \ -    }                                                                   \ +    const int shuf      = _MM_SHUFFLE(3, 2, 1, 0); + +#define convert_fc32_1_to_sc8_item32_1_bswap_guts(_al_)                                  \ +    for (size_t j = 0; i + 7 < nsamps; i += 8, j += 4) {                                 \ +        /* load from input */                                                            \ +        __m128 tmp0 = _mm_load##_al_##ps(reinterpret_cast<const float*>(input + i + 0)); \ +        __m128 tmp1 = _mm_load##_al_##ps(reinterpret_cast<const float*>(input + i + 2)); \ +        __m128 tmp2 = _mm_load##_al_##ps(reinterpret_cast<const float*>(input + i + 4)); \ +        __m128 tmp3 = _mm_load##_al_##ps(reinterpret_cast<const float*>(input + i + 6)); \ +                                                                                         \ +        /* convert */                                                                    \ +        const __m128i tmpi = pack_sc32_4x<shuf>(tmp0, tmp1, tmp2, tmp3, scalar);         \ +                                                                                         \ +        /* store to output */                                                            \ +        _mm_storeu_si128(reinterpret_cast<__m128i*>(output + j), tmpi);                  \ +    }      size_t i = 0; -    //dispatch according to alignment -    if ((size_t(input) & 0xf) == 0){ +    // dispatch according to alignment +    if ((size_t(input) & 0xf) == 0) {          convert_fc32_1_to_sc8_item32_1_bswap_guts(_) -    } -    else{ +    } else {          convert_fc32_1_to_sc8_item32_1_bswap_guts(u_)      } -    //convert remainder -    xx_to_item32_sc8<uhd::htonx>(input+i, output+(i/2), nsamps-i, scale_factor); +    // convert remainder +    xx_to_item32_sc8<uhd::htonx>(input + i, output + (i / 2), nsamps - i, scale_factor);  } -DECLARE_CONVERTER(fc32, 1, sc8_item32_le, 1, PRIORITY_SIMD){ -    const fc32_t *input = reinterpret_cast<const fc32_t *>(inputs[0]); -    item32_t *output = reinterpret_cast<item32_t *>(outputs[0]); +DECLARE_CONVERTER(fc32, 1, sc8_item32_le, 1, PRIORITY_SIMD) +{ +    const fc32_t* input = reinterpret_cast<const fc32_t*>(inputs[0]); +    item32_t* output    = reinterpret_cast<item32_t*>(outputs[0]);      const __m128 scalar = _mm_set_ps1(float(scale_factor)); -    const int shuf = _MM_SHUFFLE(0, 1, 2, 3); - -    #define convert_fc32_1_to_sc8_item32_1_nswap_guts(_al_)             \ -    for (size_t j = 0; i+7 < nsamps; i+=8, j+=4){                       \ -        /* load from input */                                           \ -        __m128 tmp0 = _mm_load ## _al_ ## ps(reinterpret_cast<const float *>(input+i+0)); \ -        __m128 tmp1 = _mm_load ## _al_ ## ps(reinterpret_cast<const float *>(input+i+2)); \ -        __m128 tmp2 = _mm_load ## _al_ ## ps(reinterpret_cast<const float *>(input+i+4)); \ -        __m128 tmp3 = _mm_load ## _al_ ## ps(reinterpret_cast<const float *>(input+i+6)); \ -                                                                        \ -        /* 
convert */                                                   \ -        const __m128i tmpi = pack_sc32_4x<shuf>(tmp0, tmp1, tmp2, tmp3, scalar); \ -                                                                        \ -        /* store to output */                                           \ -        _mm_storeu_si128(reinterpret_cast<__m128i *>(output+j), tmpi);  \ -    }                                                                   \ +    const int shuf      = _MM_SHUFFLE(0, 1, 2, 3); + +#define convert_fc32_1_to_sc8_item32_1_nswap_guts(_al_)                                  \ +    for (size_t j = 0; i + 7 < nsamps; i += 8, j += 4) {                                 \ +        /* load from input */                                                            \ +        __m128 tmp0 = _mm_load##_al_##ps(reinterpret_cast<const float*>(input + i + 0)); \ +        __m128 tmp1 = _mm_load##_al_##ps(reinterpret_cast<const float*>(input + i + 2)); \ +        __m128 tmp2 = _mm_load##_al_##ps(reinterpret_cast<const float*>(input + i + 4)); \ +        __m128 tmp3 = _mm_load##_al_##ps(reinterpret_cast<const float*>(input + i + 6)); \ +                                                                                         \ +        /* convert */                                                                    \ +        const __m128i tmpi = pack_sc32_4x<shuf>(tmp0, tmp1, tmp2, tmp3, scalar);         \ +                                                                                         \ +        /* store to output */                                                            \ +        _mm_storeu_si128(reinterpret_cast<__m128i*>(output + j), tmpi);                  \ +    }      size_t i = 0; -    //dispatch according to alignment -    if ((size_t(input) & 0xf) == 0){ +    // dispatch according to alignment +    if ((size_t(input) & 0xf) == 0) {          convert_fc32_1_to_sc8_item32_1_nswap_guts(_) -    } -    else{ +    } else {          convert_fc32_1_to_sc8_item32_1_nswap_guts(u_)      } -    //convert remainder -    xx_to_item32_sc8<uhd::htowx>(input+i, output+(i/2), nsamps-i, scale_factor); +    // convert remainder +    xx_to_item32_sc8<uhd::htowx>(input + i, output + (i / 2), nsamps - i, scale_factor);  } diff --git a/host/lib/convert/sse2_fc64_to_sc16.cpp b/host/lib/convert/sse2_fc64_to_sc16.cpp index 2004c1fd7..7c2ce1f8e 100644 --- a/host/lib/convert/sse2_fc64_to_sc16.cpp +++ b/host/lib/convert/sse2_fc64_to_sc16.cpp @@ -11,91 +11,99 @@  using namespace uhd::convert; -DECLARE_CONVERTER(fc64, 1, sc16_item32_le, 1, PRIORITY_SIMD){ -    const fc64_t *input = reinterpret_cast<const fc64_t *>(inputs[0]); -    item32_t *output = reinterpret_cast<item32_t *>(outputs[0]); +DECLARE_CONVERTER(fc64, 1, sc16_item32_le, 1, PRIORITY_SIMD) +{ +    const fc64_t* input = reinterpret_cast<const fc64_t*>(inputs[0]); +    item32_t* output    = reinterpret_cast<item32_t*>(outputs[0]);      const __m128d scalar = _mm_set1_pd(scale_factor); -    #define convert_fc64_1_to_item32_1_nswap_guts(_al_)                 \ -    for (; i+3 < nsamps; i+=4){                                         \ -        /* load from input */                                           \ -        __m128d tmp0 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+0)); \ -        __m128d tmp1 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+1)); \ -        __m128d tmp2 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+2)); \ -        __m128d tmp3 = _mm_load ## _al_ ## 
pd(reinterpret_cast<const double *>(input+i+3)); \ -                                                                        \ -        /* convert and scale */                                         \ -        __m128i tmpi0 = _mm_cvttpd_epi32(_mm_mul_pd(tmp0, scalar));     \ -        __m128i tmpi1 = _mm_cvttpd_epi32(_mm_mul_pd(tmp1, scalar));     \ -        __m128i tmpilo = _mm_unpacklo_epi64(tmpi0, tmpi1);              \ -        __m128i tmpi2 = _mm_cvttpd_epi32(_mm_mul_pd(tmp2, scalar));     \ -        __m128i tmpi3 = _mm_cvttpd_epi32(_mm_mul_pd(tmp3, scalar));     \ -        __m128i tmpihi = _mm_unpacklo_epi64(tmpi2, tmpi3);              \ -                                                                        \ -        /* pack + swap 16-bit pairs */                                  \ -        __m128i tmpi = _mm_packs_epi32(tmpilo, tmpihi);                 \ -        tmpi = _mm_shufflelo_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1));      \ -        tmpi = _mm_shufflehi_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1));      \ -                                                                        \ -        /* store to output */                                           \ -        _mm_storeu_si128(reinterpret_cast<__m128i *>(output+i), tmpi);  \ -    }                                                                   \ +#define convert_fc64_1_to_item32_1_nswap_guts(_al_)                             \ +    for (; i + 3 < nsamps; i += 4) {                                            \ +        /* load from input */                                                   \ +        __m128d tmp0 =                                                          \ +            _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 0)); \ +        __m128d tmp1 =                                                          \ +            _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 1)); \ +        __m128d tmp2 =                                                          \ +            _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 2)); \ +        __m128d tmp3 =                                                          \ +            _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 3)); \ +                                                                                \ +        /* convert and scale */                                                 \ +        __m128i tmpi0  = _mm_cvttpd_epi32(_mm_mul_pd(tmp0, scalar));            \ +        __m128i tmpi1  = _mm_cvttpd_epi32(_mm_mul_pd(tmp1, scalar));            \ +        __m128i tmpilo = _mm_unpacklo_epi64(tmpi0, tmpi1);                      \ +        __m128i tmpi2  = _mm_cvttpd_epi32(_mm_mul_pd(tmp2, scalar));            \ +        __m128i tmpi3  = _mm_cvttpd_epi32(_mm_mul_pd(tmp3, scalar));            \ +        __m128i tmpihi = _mm_unpacklo_epi64(tmpi2, tmpi3);                      \ +                                                                                \ +        /* pack + swap 16-bit pairs */                                          \ +        __m128i tmpi = _mm_packs_epi32(tmpilo, tmpihi);                         \ +        tmpi         = _mm_shufflelo_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1));      \ +        tmpi         = _mm_shufflehi_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1));      \ +                                                                                \ +        /* store to output */                                                   \ +        _mm_storeu_si128(reinterpret_cast<__m128i*>(output + i), 
tmpi);         \ +    }      size_t i = 0; -    //dispatch according to alignment -    if ((size_t(input) & 0xf) == 0){ +    // dispatch according to alignment +    if ((size_t(input) & 0xf) == 0) {          convert_fc64_1_to_item32_1_nswap_guts(_) -    } -    else{ +    } else {          convert_fc64_1_to_item32_1_nswap_guts(u_)      } -    //convert remainder -    xx_to_item32_sc16<uhd::htowx>(input+i, output+i, nsamps-i, scale_factor); +    // convert remainder +    xx_to_item32_sc16<uhd::htowx>(input + i, output + i, nsamps - i, scale_factor);  } -DECLARE_CONVERTER(fc64, 1, sc16_item32_be, 1, PRIORITY_SIMD){ -    const fc64_t *input = reinterpret_cast<const fc64_t *>(inputs[0]); -    item32_t *output = reinterpret_cast<item32_t *>(outputs[0]); +DECLARE_CONVERTER(fc64, 1, sc16_item32_be, 1, PRIORITY_SIMD) +{ +    const fc64_t* input = reinterpret_cast<const fc64_t*>(inputs[0]); +    item32_t* output    = reinterpret_cast<item32_t*>(outputs[0]);      const __m128d scalar = _mm_set1_pd(scale_factor); -    #define convert_fc64_1_to_item32_1_bswap_guts(_al_)                 \ -    for (; i+3 < nsamps; i+=4){                                         \ -        /* load from input */                                           \ -        __m128d tmp0 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+0)); \ -        __m128d tmp1 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+1)); \ -        __m128d tmp2 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+2)); \ -        __m128d tmp3 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+3)); \ -                                                                        \ -        /* convert and scale */                                         \ -        __m128i tmpi0 = _mm_cvttpd_epi32(_mm_mul_pd(tmp0, scalar));     \ -        __m128i tmpi1 = _mm_cvttpd_epi32(_mm_mul_pd(tmp1, scalar));     \ -        __m128i tmpilo = _mm_unpacklo_epi64(tmpi0, tmpi1);              \ -        __m128i tmpi2 = _mm_cvttpd_epi32(_mm_mul_pd(tmp2, scalar));     \ -        __m128i tmpi3 = _mm_cvttpd_epi32(_mm_mul_pd(tmp3, scalar));     \ -        __m128i tmpihi = _mm_unpacklo_epi64(tmpi2, tmpi3);              \ -                                                                        \ -        /* pack + byteswap -> byteswap 16 bit words */                  \ -        __m128i tmpi = _mm_packs_epi32(tmpilo, tmpihi);                 \ -        tmpi = _mm_or_si128(_mm_srli_epi16(tmpi, 8), _mm_slli_epi16(tmpi, 8)); \ -                                                                        \ -        /* store to output */                                           \ -        _mm_storeu_si128(reinterpret_cast<__m128i *>(output+i), tmpi);  \ -    }                                                                   \ +#define convert_fc64_1_to_item32_1_bswap_guts(_al_)                                    \ +    for (; i + 3 < nsamps; i += 4) {                                                   \ +        /* load from input */                                                          \ +        __m128d tmp0 =                                                                 \ +            _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 0));        \ +        __m128d tmp1 =                                                                 \ +            _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 1));        \ +        __m128d tmp2 =                                                                 \ +     
       _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 2));        \ +        __m128d tmp3 =                                                                 \ +            _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 3));        \ +                                                                                       \ +        /* convert and scale */                                                        \ +        __m128i tmpi0  = _mm_cvttpd_epi32(_mm_mul_pd(tmp0, scalar));                   \ +        __m128i tmpi1  = _mm_cvttpd_epi32(_mm_mul_pd(tmp1, scalar));                   \ +        __m128i tmpilo = _mm_unpacklo_epi64(tmpi0, tmpi1);                             \ +        __m128i tmpi2  = _mm_cvttpd_epi32(_mm_mul_pd(tmp2, scalar));                   \ +        __m128i tmpi3  = _mm_cvttpd_epi32(_mm_mul_pd(tmp3, scalar));                   \ +        __m128i tmpihi = _mm_unpacklo_epi64(tmpi2, tmpi3);                             \ +                                                                                       \ +        /* pack + byteswap -> byteswap 16 bit words */                                 \ +        __m128i tmpi = _mm_packs_epi32(tmpilo, tmpihi);                                \ +        tmpi         = _mm_or_si128(_mm_srli_epi16(tmpi, 8), _mm_slli_epi16(tmpi, 8)); \ +                                                                                       \ +        /* store to output */                                                          \ +        _mm_storeu_si128(reinterpret_cast<__m128i*>(output + i), tmpi);                \ +    }      size_t i = 0; -    //dispatch according to alignment -    if ((size_t(input) & 0xf) == 0){ +    // dispatch according to alignment +    if ((size_t(input) & 0xf) == 0) {          convert_fc64_1_to_item32_1_bswap_guts(_) -    } -    else{ +    } else {          convert_fc64_1_to_item32_1_bswap_guts(u_)      } -    //convert remainder -    xx_to_item32_sc16<uhd::htonx>(input+i, output+i, nsamps-i, scale_factor); +    // convert remainder +    xx_to_item32_sc16<uhd::htonx>(input + i, output + i, nsamps - i, scale_factor);  } diff --git a/host/lib/convert/sse2_fc64_to_sc8.cpp b/host/lib/convert/sse2_fc64_to_sc8.cpp index 455ca95e3..95db4e927 100644 --- a/host/lib/convert/sse2_fc64_to_sc8.cpp +++ b/host/lib/convert/sse2_fc64_to_sc8.cpp @@ -12,108 +12,119 @@  using namespace uhd::convert;  UHD_INLINE __m128i pack_sc8_item32_4x( -    const __m128i &in0, const __m128i &in1, -    const __m128i &in2, const __m128i &in3 -){ +    const __m128i& in0, const __m128i& in1, const __m128i& in2, const __m128i& in3) +{      const __m128i lo = _mm_packs_epi32(in0, in1);      const __m128i hi = _mm_packs_epi32(in2, in3);      return _mm_packs_epi16(lo, hi);  }  UHD_INLINE __m128i pack_sc32_4x( -    const __m128d &lo, const __m128d &hi, -    const __m128d &scalar -){ +    const __m128d& lo, const __m128d& hi, const __m128d& scalar) +{      const __m128i tmpi_lo = _mm_cvttpd_epi32(_mm_mul_pd(hi, scalar));      const __m128i tmpi_hi = _mm_cvttpd_epi32(_mm_mul_pd(lo, scalar));      return _mm_unpacklo_epi64(tmpi_lo, tmpi_hi);  } -DECLARE_CONVERTER(fc64, 1, sc8_item32_be, 1, PRIORITY_SIMD){ -    const fc64_t *input = reinterpret_cast<const fc64_t *>(inputs[0]); -    item32_t *output = reinterpret_cast<item32_t *>(outputs[0]); +DECLARE_CONVERTER(fc64, 1, sc8_item32_be, 1, PRIORITY_SIMD) +{ +    const fc64_t* input = reinterpret_cast<const fc64_t*>(inputs[0]); +    item32_t* output    = 
reinterpret_cast<item32_t*>(outputs[0]);      const __m128d scalar = _mm_set1_pd(scale_factor); -    #define convert_fc64_1_to_sc8_item32_1_bswap_guts(_al_)             \ -    for (size_t j = 0; i+7 < nsamps; i+=8, j+=4){                       \ -        /* load from input */                                           \ -        __m128d tmp0 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+0)); \ -        __m128d tmp1 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+1)); \ -        __m128d tmp2 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+2)); \ -        __m128d tmp3 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+3)); \ -        __m128d tmp4 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+4)); \ -        __m128d tmp5 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+5)); \ -        __m128d tmp6 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+6)); \ -        __m128d tmp7 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+7)); \ -                                                                        \ -        /* interleave */                                                \ -        const __m128i tmpi = pack_sc8_item32_4x(                        \ -            pack_sc32_4x(tmp1, tmp0, scalar),                           \ -            pack_sc32_4x(tmp3, tmp2, scalar),                           \ -            pack_sc32_4x(tmp5, tmp4, scalar),                           \ -            pack_sc32_4x(tmp7, tmp6, scalar)                            \ -        );                                                              \ -                                                                        \ -        /* store to output */                                           \ -        _mm_storeu_si128(reinterpret_cast<__m128i *>(output+j), tmpi);  \ -    }                                                                   \ +#define convert_fc64_1_to_sc8_item32_1_bswap_guts(_al_)                           \ +    for (size_t j = 0; i + 7 < nsamps; i += 8, j += 4) {                          \ +        /* load from input */                                                     \ +        __m128d tmp0 =                                                            \ +            _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 0));   \ +        __m128d tmp1 =                                                            \ +            _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 1));   \ +        __m128d tmp2 =                                                            \ +            _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 2));   \ +        __m128d tmp3 =                                                            \ +            _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 3));   \ +        __m128d tmp4 =                                                            \ +            _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 4));   \ +        __m128d tmp5 =                                                            \ +            _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 5));   \ +        __m128d tmp6 =                                                            \ +            _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 6));   \ +        __m128d tmp7 =                                                            \ +            
_mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 7));   \ +                                                                                  \ +        /* interleave */                                                          \ +        const __m128i tmpi = pack_sc8_item32_4x(pack_sc32_4x(tmp1, tmp0, scalar), \ +            pack_sc32_4x(tmp3, tmp2, scalar),                                     \ +            pack_sc32_4x(tmp5, tmp4, scalar),                                     \ +            pack_sc32_4x(tmp7, tmp6, scalar));                                    \ +                                                                                  \ +        /* store to output */                                                     \ +        _mm_storeu_si128(reinterpret_cast<__m128i*>(output + j), tmpi);           \ +    }      size_t i = 0; -    //dispatch according to alignment -    if ((size_t(input) & 0xf) == 0){ +    // dispatch according to alignment +    if ((size_t(input) & 0xf) == 0) {          convert_fc64_1_to_sc8_item32_1_bswap_guts(_) -    } -    else{ +    } else {          convert_fc64_1_to_sc8_item32_1_bswap_guts(u_)      } -    //convert remainder -    xx_to_item32_sc8<uhd::htonx>(input+i, output+(i/2), nsamps-i, scale_factor); +    // convert remainder +    xx_to_item32_sc8<uhd::htonx>(input + i, output + (i / 2), nsamps - i, scale_factor);  } -DECLARE_CONVERTER(fc64, 1, sc8_item32_le, 1, PRIORITY_SIMD){ -    const fc64_t *input = reinterpret_cast<const fc64_t *>(inputs[0]); -    item32_t *output = reinterpret_cast<item32_t *>(outputs[0]); +DECLARE_CONVERTER(fc64, 1, sc8_item32_le, 1, PRIORITY_SIMD) +{ +    const fc64_t* input = reinterpret_cast<const fc64_t*>(inputs[0]); +    item32_t* output    = reinterpret_cast<item32_t*>(outputs[0]);      const __m128d scalar = _mm_set1_pd(scale_factor); -    #define convert_fc64_1_to_sc8_item32_1_nswap_guts(_al_)             \ -    for (size_t j = 0; i+7 < nsamps; i+=8, j+=4){                       \ -        /* load from input */                                           \ -        __m128d tmp0 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+0)); \ -        __m128d tmp1 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+1)); \ -        __m128d tmp2 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+2)); \ -        __m128d tmp3 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+3)); \ -        __m128d tmp4 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+4)); \ -        __m128d tmp5 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+5)); \ -        __m128d tmp6 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+6)); \ -        __m128d tmp7 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+7)); \ -                                                                        \ -        /* interleave */                                                \ -        __m128i tmpi = pack_sc8_item32_4x(                              \ -            pack_sc32_4x(tmp0, tmp1, scalar),                           \ -            pack_sc32_4x(tmp2, tmp3, scalar),                           \ -            pack_sc32_4x(tmp4, tmp5, scalar),                           \ -            pack_sc32_4x(tmp6, tmp7, scalar)                            \ -        );                                                              \ -        tmpi = _mm_or_si128(_mm_srli_epi16(tmpi, 8), _mm_slli_epi16(tmpi, 8)); /*byteswap*/\ -                        
                                                \ -        /* store to output */                                           \ -        _mm_storeu_si128(reinterpret_cast<__m128i *>(output+j), tmpi);  \ -    }                                                                   \ +#define convert_fc64_1_to_sc8_item32_1_nswap_guts(_al_)                                  \ +    for (size_t j = 0; i + 7 < nsamps; i += 8, j += 4) {                                 \ +        /* load from input */                                                            \ +        __m128d tmp0 =                                                                   \ +            _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 0));          \ +        __m128d tmp1 =                                                                   \ +            _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 1));          \ +        __m128d tmp2 =                                                                   \ +            _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 2));          \ +        __m128d tmp3 =                                                                   \ +            _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 3));          \ +        __m128d tmp4 =                                                                   \ +            _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 4));          \ +        __m128d tmp5 =                                                                   \ +            _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 5));          \ +        __m128d tmp6 =                                                                   \ +            _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 6));          \ +        __m128d tmp7 =                                                                   \ +            _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 7));          \ +                                                                                         \ +        /* interleave */                                                                 \ +        __m128i tmpi = pack_sc8_item32_4x(pack_sc32_4x(tmp0, tmp1, scalar),              \ +            pack_sc32_4x(tmp2, tmp3, scalar),                                            \ +            pack_sc32_4x(tmp4, tmp5, scalar),                                            \ +            pack_sc32_4x(tmp6, tmp7, scalar));                                           \ +        tmpi =                                                                           \ +            _mm_or_si128(_mm_srli_epi16(tmpi, 8), _mm_slli_epi16(tmpi, 8)); /*byteswap*/ \ +                                                                                         \ +        /* store to output */                                                            \ +        _mm_storeu_si128(reinterpret_cast<__m128i*>(output + j), tmpi);                  \ +    }      size_t i = 0; -    //dispatch according to alignment -    if ((size_t(input) & 0xf) == 0){ +    // dispatch according to alignment +    if ((size_t(input) & 0xf) == 0) {          convert_fc64_1_to_sc8_item32_1_nswap_guts(_) -    } -    else{ +    } else {          convert_fc64_1_to_sc8_item32_1_nswap_guts(u_)      } -    //convert remainder -    xx_to_item32_sc8<uhd::htowx>(input+i, output+(i/2), nsamps-i, scale_factor); +    // convert remainder +    xx_to_item32_sc8<uhd::htowx>(input + i, output + (i 
/ 2), nsamps - i, scale_factor);  } diff --git a/host/lib/convert/sse2_sc16_to_fc32.cpp b/host/lib/convert/sse2_sc16_to_fc32.cpp index d75c4a2a7..a16ef30d4 100644 --- a/host/lib/convert/sse2_sc16_to_fc32.cpp +++ b/host/lib/convert/sse2_sc16_to_fc32.cpp @@ -11,105 +11,111 @@  using namespace uhd::convert; -DECLARE_CONVERTER(sc16_item32_le, 1, fc32, 1, PRIORITY_SIMD){ -    const item32_t *input = reinterpret_cast<const item32_t *>(inputs[0]); -    fc32_t *output = reinterpret_cast<fc32_t *>(outputs[0]); +DECLARE_CONVERTER(sc16_item32_le, 1, fc32, 1, PRIORITY_SIMD) +{ +    const item32_t* input = reinterpret_cast<const item32_t*>(inputs[0]); +    fc32_t* output        = reinterpret_cast<fc32_t*>(outputs[0]); -    const __m128 scalar = _mm_set_ps1(float(scale_factor)/(1 << 16)); +    const __m128 scalar = _mm_set_ps1(float(scale_factor) / (1 << 16));      const __m128i zeroi = _mm_setzero_si128(); -    // this macro converts values faster by using SSE intrinsics to convert 4 values at a time -    #define convert_item32_1_to_fc32_1_nswap_guts(_al_)                 \ -    for (; i+3 < nsamps; i+=4){                                         \ -        /* load from input */                                           \ -        __m128i tmpi = _mm_loadu_si128(reinterpret_cast<const __m128i *>(input+i)); \ -                                                                        \ -        /* unpack + swap 16-bit pairs */                                \ -        tmpi = _mm_shufflelo_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1));      \ -        tmpi = _mm_shufflehi_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1));      \ +// this macro converts values faster by using SSE intrinsics to convert 4 values at a time +#define convert_item32_1_to_fc32_1_nswap_guts(_al_)                                    \ +    for (; i + 3 < nsamps; i += 4) {                                                   \ +        /* load from input */                                                          \ +        __m128i tmpi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(input + i));   \ +                                                                                       \ +        /* unpack + swap 16-bit pairs */                                               \ +        tmpi           = _mm_shufflelo_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1));           \ +        tmpi           = _mm_shufflehi_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1));           \          __m128i tmpilo = _mm_unpacklo_epi16(zeroi, tmpi); /* value in upper 16 bits */ \ -        __m128i tmpihi = _mm_unpackhi_epi16(zeroi, tmpi);               \ -                                                                        \ -        /* convert and scale */                                         \ -        __m128 tmplo = _mm_mul_ps(_mm_cvtepi32_ps(tmpilo), scalar);     \ -        __m128 tmphi = _mm_mul_ps(_mm_cvtepi32_ps(tmpihi), scalar);     \ -                                                                        \ -        /* store to output */                                           \ -        _mm_store ## _al_ ## ps(reinterpret_cast<float *>(output+i+0), tmplo); \ -        _mm_store ## _al_ ## ps(reinterpret_cast<float *>(output+i+2), tmphi); \ -    }                                                                   \ +        __m128i tmpihi = _mm_unpackhi_epi16(zeroi, tmpi);                              \ +                                                                                       \ +        /* convert and scale */                                                        \ +   
     __m128 tmplo = _mm_mul_ps(_mm_cvtepi32_ps(tmpilo), scalar);                    \ +        __m128 tmphi = _mm_mul_ps(_mm_cvtepi32_ps(tmpihi), scalar);                    \ +                                                                                       \ +        /* store to output */                                                          \ +        _mm_store##_al_##ps(reinterpret_cast<float*>(output + i + 0), tmplo);          \ +        _mm_store##_al_##ps(reinterpret_cast<float*>(output + i + 2), tmphi);          \ +    }      size_t i = 0;      // need to dispatch according to alignment for fastest conversion -    switch (size_t(output) & 0xf){ -    case 0x0: -        // the data is 16-byte aligned, so do the fast processing of the bulk of the samples -        convert_item32_1_to_fc32_1_nswap_guts(_) -        break; -    case 0x8: -        // the first sample is 8-byte aligned - process it to align the remainder of the samples to 16-bytes -        item32_sc16_to_xx<uhd::htowx>(input, output, 1, scale_factor); -        i++; -        // do faster processing of the bulk of the samples now that we are 16-byte aligned -        convert_item32_1_to_fc32_1_nswap_guts(_) -        break; -    default: -        // we are not 8 or 16-byte aligned, so do fast processing with the unaligned load and store -        convert_item32_1_to_fc32_1_nswap_guts(u_) +    switch (size_t(output) & 0xf) { +        case 0x0: +            // the data is 16-byte aligned, so do the fast processing of the bulk of the +            // samples +            convert_item32_1_to_fc32_1_nswap_guts(_) break; +        case 0x8: +            // the first sample is 8-byte aligned - process it to align the remainder of +            // the samples to 16-bytes +            item32_sc16_to_xx<uhd::htowx>(input, output, 1, scale_factor); +            i++; +            // do faster processing of the bulk of the samples now that we are 16-byte +            // aligned +            convert_item32_1_to_fc32_1_nswap_guts(_) break; +        default: +            // we are not 8 or 16-byte aligned, so do fast processing with the unaligned +            // load and store +            convert_item32_1_to_fc32_1_nswap_guts(u_)      }      // convert any remaining samples -    item32_sc16_to_xx<uhd::htowx>(input+i, output+i, nsamps-i, scale_factor); +    item32_sc16_to_xx<uhd::htowx>(input + i, output + i, nsamps - i, scale_factor);  } -DECLARE_CONVERTER(sc16_item32_be, 1, fc32, 1, PRIORITY_SIMD){ -    const item32_t *input = reinterpret_cast<const item32_t *>(inputs[0]); -    fc32_t *output = reinterpret_cast<fc32_t *>(outputs[0]); +DECLARE_CONVERTER(sc16_item32_be, 1, fc32, 1, PRIORITY_SIMD) +{ +    const item32_t* input = reinterpret_cast<const item32_t*>(inputs[0]); +    fc32_t* output        = reinterpret_cast<fc32_t*>(outputs[0]); -    const __m128 scalar = _mm_set_ps1(float(scale_factor)/(1 << 16)); +    const __m128 scalar = _mm_set_ps1(float(scale_factor) / (1 << 16));      const __m128i zeroi = _mm_setzero_si128(); -    // this macro converts values faster by using SSE intrinsics to convert 4 values at a time -    #define convert_item32_1_to_fc32_1_bswap_guts(_al_)                 \ -    for (; i+3 < nsamps; i+=4){                                         \ -        /* load from input */                                           \ -        __m128i tmpi = _mm_loadu_si128(reinterpret_cast<const __m128i *>(input+i)); \ -                                                                        \ -        /* byteswap + unpack -> 
byteswap 16 bit words */                \ -        tmpi = _mm_or_si128(_mm_srli_epi16(tmpi, 8), _mm_slli_epi16(tmpi, 8)); \ -        __m128i tmpilo = _mm_unpacklo_epi16(zeroi, tmpi); /* value in upper 16 bits */ \ -        __m128i tmpihi = _mm_unpackhi_epi16(zeroi, tmpi);               \ -                                                                        \ -        /* convert and scale */                                         \ -        __m128 tmplo = _mm_mul_ps(_mm_cvtepi32_ps(tmpilo), scalar);     \ -        __m128 tmphi = _mm_mul_ps(_mm_cvtepi32_ps(tmpihi), scalar);     \ -                                                                        \ -        /* store to output */                                           \ -        _mm_store ## _al_ ## ps(reinterpret_cast<float *>(output+i+0), tmplo); \ -        _mm_store ## _al_ ## ps(reinterpret_cast<float *>(output+i+2), tmphi); \ -    }                                                                   \ +// this macro converts values faster by using SSE intrinsics to convert 4 values at a time +#define convert_item32_1_to_fc32_1_bswap_guts(_al_)                                      \ +    for (; i + 3 < nsamps; i += 4) {                                                     \ +        /* load from input */                                                            \ +        __m128i tmpi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(input + i));     \ +                                                                                         \ +        /* byteswap + unpack -> byteswap 16 bit words */                                 \ +        tmpi           = _mm_or_si128(_mm_srli_epi16(tmpi, 8), _mm_slli_epi16(tmpi, 8)); \ +        __m128i tmpilo = _mm_unpacklo_epi16(zeroi, tmpi); /* value in upper 16 bits */   \ +        __m128i tmpihi = _mm_unpackhi_epi16(zeroi, tmpi);                                \ +                                                                                         \ +        /* convert and scale */                                                          \ +        __m128 tmplo = _mm_mul_ps(_mm_cvtepi32_ps(tmpilo), scalar);                      \ +        __m128 tmphi = _mm_mul_ps(_mm_cvtepi32_ps(tmpihi), scalar);                      \ +                                                                                         \ +        /* store to output */                                                            \ +        _mm_store##_al_##ps(reinterpret_cast<float*>(output + i + 0), tmplo);            \ +        _mm_store##_al_##ps(reinterpret_cast<float*>(output + i + 2), tmphi);            \ +    }      size_t i = 0;      // need to dispatch according to alignment for fastest conversion -    switch (size_t(output) & 0xf){ -    case 0x0: -        // the data is 16-byte aligned, so do the fast processing of the bulk of the samples -        convert_item32_1_to_fc32_1_bswap_guts(_) -        break; -    case 0x8: -        // the first sample is 8-byte aligned - process it to align the remainder of the samples to 16-bytes -        item32_sc16_to_xx<uhd::htonx>(input, output, 1, scale_factor); -        i++; -        // do faster processing of the bulk of the samples now that we are 16-byte aligned -        convert_item32_1_to_fc32_1_bswap_guts(_) -        break; -    default: -        // we are not 8 or 16-byte aligned, so do fast processing with the unaligned load and store -        convert_item32_1_to_fc32_1_bswap_guts(u_) +    switch (size_t(output) & 0xf) { +        case 0x0: +            // the 
data is 16-byte aligned, so do the fast processing of the bulk of the
+            // samples
+            convert_item32_1_to_fc32_1_bswap_guts(_) break;
+        case 0x8:
+            // the first sample is 8-byte aligned - process it to align the remainder of
+            // the samples to 16-bytes
+            item32_sc16_to_xx<uhd::htonx>(input, output, 1, scale_factor);
+            i++;
+            // do faster processing of the bulk of the samples now that we are 16-byte
+            // aligned
+            convert_item32_1_to_fc32_1_bswap_guts(_) break;
+        default:
+            // we are not 8 or 16-byte aligned, so do fast processing with the unaligned
+            // load and store
+            convert_item32_1_to_fc32_1_bswap_guts(u_)
     }

     // convert any remaining samples
-    item32_sc16_to_xx<uhd::htonx>(input+i, output+i, nsamps-i, scale_factor);
+    item32_sc16_to_xx<uhd::htonx>(input + i, output + i, nsamps - i, scale_factor);
 }
diff --git a/host/lib/convert/sse2_sc16_to_fc64.cpp b/host/lib/convert/sse2_sc16_to_fc64.cpp
index 7f22fd07f..45821ac9f 100644
--- a/host/lib/convert/sse2_sc16_to_fc64.cpp
+++ b/host/lib/convert/sse2_sc16_to_fc64.cpp
@@ -11,95 +11,95 @@
 using namespace uhd::convert;

-DECLARE_CONVERTER(sc16_item32_le, 1, fc64, 1, PRIORITY_SIMD){
-    const item32_t *input = reinterpret_cast<const item32_t *>(inputs[0]);
-    fc64_t *output = reinterpret_cast<fc64_t *>(outputs[0]);
+DECLARE_CONVERTER(sc16_item32_le, 1, fc64, 1, PRIORITY_SIMD)
+{
+    const item32_t* input = reinterpret_cast<const item32_t*>(inputs[0]);
+    fc64_t* output        = reinterpret_cast<fc64_t*>(outputs[0]);

-    const __m128d scalar = _mm_set1_pd(scale_factor/(1 << 16));
-    const __m128i zeroi = _mm_setzero_si128();
+    const __m128d scalar = _mm_set1_pd(scale_factor / (1 << 16));
+    const __m128i zeroi  = _mm_setzero_si128();

-    #define convert_item32_1_to_fc64_1_nswap_guts(_al_)                 \
-    for (; i+3 < nsamps; i+=4){                                         \
-        /* load from input */                                           \
-        __m128i tmpi = _mm_loadu_si128(reinterpret_cast<const __m128i *>(input+i)); \
-                                                                        \
-        /* unpack + swap 16-bit pairs */                                \
-        tmpi = _mm_shufflelo_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1));      \
-        tmpi = _mm_shufflehi_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1));      \
+#define convert_item32_1_to_fc64_1_nswap_guts(_al_)                                    \
+    for (; i + 3 < nsamps; i += 4) {                                                   \
+        /* load from input */                                                          \
+        __m128i tmpi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(input + i));   \
+                                                                                       \
+        /* unpack + swap 16-bit pairs */                                               \
+        tmpi           = _mm_shufflelo_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1));           \
+        tmpi           = _mm_shufflehi_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1));           \
         __m128i tmpilo = _mm_unpacklo_epi16(zeroi, tmpi); /* value in upper 16 bits */ \
-        __m128i tmpihi = _mm_unpackhi_epi16(zeroi, tmpi);               \
-                                                                        \
-        /* convert and scale */                                         \
-        __m128d tmp0 = _mm_mul_pd(_mm_cvtepi32_pd(tmpilo), scalar);     \
-        tmpilo = _mm_unpackhi_epi64(tmpilo, zeroi);                     \
-        __m128d tmp1 = _mm_mul_pd(_mm_cvtepi32_pd(tmpilo), scalar);     \
-        __m128d tmp2 = _mm_mul_pd(_mm_cvtepi32_pd(tmpihi), scalar);     \
-        tmpihi = _mm_unpackhi_epi64(tmpihi, zeroi);                     \
-        __m128d tmp3 = _mm_mul_pd(_mm_cvtepi32_pd(tmpihi), scalar);     \
-                                                                        \
-        /* store to output */                                           \
-        _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+i+0), tmp0); \
-        _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+i+1), tmp1); \
-        _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+i+2), tmp2); \
-        _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+i+3), tmp3); \
-    }                                                                   \
+        __m128i tmpihi = _mm_unpackhi_epi16(zeroi, tmpi);                              \
+                                                                                       \
+        /* convert and scale */                                                        \
+        __m128d tmp0 = _mm_mul_pd(_mm_cvtepi32_pd(tmpilo), scalar);                    \
+        tmpilo       = _mm_unpackhi_epi64(tmpilo, zeroi);                              \
+        __m128d tmp1 = _mm_mul_pd(_mm_cvtepi32_pd(tmpilo), scalar);                    \
+        __m128d tmp2 = _mm_mul_pd(_mm_cvtepi32_pd(tmpihi), scalar);                    \
+        tmpihi       = _mm_unpackhi_epi64(tmpihi, zeroi);                              \
+        __m128d tmp3 = _mm_mul_pd(_mm_cvtepi32_pd(tmpihi), scalar);                    \
+                                                                                       \
+        /* store to output */                                                          \
+        _mm_store##_al_##pd(reinterpret_cast<double*>(output + i + 0), tmp0);          \
+        _mm_store##_al_##pd(reinterpret_cast<double*>(output + i + 1), tmp1);          \
+        _mm_store##_al_##pd(reinterpret_cast<double*>(output + i + 2), tmp2);          \
+        _mm_store##_al_##pd(reinterpret_cast<double*>(output + i + 3), tmp3);          \
+    }

     size_t i = 0;

-    //dispatch according to alignment
-    if ((size_t(output) & 0xf) == 0){
+    // dispatch according to alignment
+    if ((size_t(output) & 0xf) == 0) {
         convert_item32_1_to_fc64_1_nswap_guts(_)
-    }
-    else{
+    } else {
         convert_item32_1_to_fc64_1_nswap_guts(u_)
     }

-    //convert remainder
-    item32_sc16_to_xx<uhd::htowx>(input+i, output+i, nsamps-i, scale_factor);
+    // convert remainder
+    item32_sc16_to_xx<uhd::htowx>(input + i, output + i, nsamps - i, scale_factor);
 }

-DECLARE_CONVERTER(sc16_item32_be, 1, fc64, 1, PRIORITY_SIMD){
-    const item32_t *input = reinterpret_cast<const item32_t *>(inputs[0]);
-    fc64_t *output = reinterpret_cast<fc64_t *>(outputs[0]);
+DECLARE_CONVERTER(sc16_item32_be, 1, fc64, 1, PRIORITY_SIMD)
+{
+    const item32_t* input = reinterpret_cast<const item32_t*>(inputs[0]);
+    fc64_t* output        = reinterpret_cast<fc64_t*>(outputs[0]);

-    const __m128d scalar = _mm_set1_pd(scale_factor/(1 << 16));
-    const __m128i zeroi = _mm_setzero_si128();
+    const __m128d scalar = _mm_set1_pd(scale_factor / (1 << 16));
+    const __m128i zeroi  = _mm_setzero_si128();

-    #define convert_item32_1_to_fc64_1_bswap_guts(_al_)                 \
-    for (; i+3 < nsamps; i+=4){                                         \
-        /* load from input */                                           \
-        __m128i tmpi = _mm_loadu_si128(reinterpret_cast<const __m128i *>(input+i)); \
-                                                                        \
-        /* byteswap + unpack -> byteswap 16 bit words */                \
-        tmpi = _mm_or_si128(_mm_srli_epi16(tmpi, 8), _mm_slli_epi16(tmpi, 8)); \
-        __m128i tmpilo = _mm_unpacklo_epi16(zeroi, tmpi); /* value in upper 16 bits */ \
-        __m128i tmpihi = _mm_unpackhi_epi16(zeroi, tmpi);               \
-                                                                        \
-        /* convert and scale */                                         \
-        __m128d tmp0 = _mm_mul_pd(_mm_cvtepi32_pd(tmpilo), scalar);     \
-        tmpilo = _mm_unpackhi_epi64(tmpilo, zeroi);                     \
-        __m128d tmp1 = _mm_mul_pd(_mm_cvtepi32_pd(tmpilo), scalar);     \
-        __m128d tmp2 = _mm_mul_pd(_mm_cvtepi32_pd(tmpihi), scalar);     \
-        tmpihi = _mm_unpackhi_epi64(tmpihi, zeroi);                     \
-        __m128d tmp3 = _mm_mul_pd(_mm_cvtepi32_pd(tmpihi), scalar);     \
-                                                                        \
-        /* store to output */                                           \
-        _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+i+0), tmp0); \
-        _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+i+1), tmp1); \
-        _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+i+2), tmp2); \
-        _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+i+3), tmp3); \
-    }                                                                   \
+#define convert_item32_1_to_fc64_1_bswap_guts(_al_)                                      \
+    for (; i + 3 < nsamps; i += 4) {                                                     \
+        /* load from input */                                                            \
+        __m128i tmpi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(input + i));     \
+                                                                                         \
+        /* byteswap + unpack -> byteswap 16 bit words */                                 \
+        tmpi           = _mm_or_si128(_mm_srli_epi16(tmpi, 8), _mm_slli_epi16(tmpi, 8)); \
+        __m128i tmpilo = _mm_unpacklo_epi16(zeroi, tmpi); /* value in upper 16 bits */   \
+        __m128i tmpihi = _mm_unpackhi_epi16(zeroi, tmpi);                                \
+                                                                                         \
+        /* convert and scale */                                                          \
+        __m128d tmp0 = _mm_mul_pd(_mm_cvtepi32_pd(tmpilo), scalar);                      \
+        tmpilo       = _mm_unpackhi_epi64(tmpilo, zeroi);                                \
+        __m128d tmp1 = _mm_mul_pd(_mm_cvtepi32_pd(tmpilo), scalar);                      \
+        __m128d tmp2 = _mm_mul_pd(_mm_cvtepi32_pd(tmpihi), scalar);                      \
+        tmpihi       = _mm_unpackhi_epi64(tmpihi, zeroi);                                \
+        __m128d tmp3 = _mm_mul_pd(_mm_cvtepi32_pd(tmpihi), scalar);                      \
+                                                                                         \
+        /* store to output */                                                            \
+        _mm_store##_al_##pd(reinterpret_cast<double*>(output + i + 0), tmp0);            \
+        _mm_store##_al_##pd(reinterpret_cast<double*>(output + i + 1), tmp1);            \
+        _mm_store##_al_##pd(reinterpret_cast<double*>(output + i + 2), tmp2);            \
+        _mm_store##_al_##pd(reinterpret_cast<double*>(output + i + 3), tmp3);            \
+    }

     size_t i = 0;

-    //dispatch according to alignment
-    if ((size_t(output) & 0xf) == 0){
+    // dispatch according to alignment
+    if ((size_t(output) & 0xf) == 0) {
         convert_item32_1_to_fc64_1_bswap_guts(_)
-    }
-    else{
+    } else {
         convert_item32_1_to_fc64_1_bswap_guts(u_)
     }

-    //convert remainder
-    item32_sc16_to_xx<uhd::htonx>(input+i, output+i, nsamps-i, scale_factor);
+    // convert remainder
+    item32_sc16_to_xx<uhd::htonx>(input + i, output + i, nsamps - i, scale_factor);
 }
diff --git a/host/lib/convert/sse2_sc16_to_sc16.cpp b/host/lib/convert/sse2_sc16_to_sc16.cpp
index 5c81f357b..e484bee31 100644
--- a/host/lib/convert/sse2_sc16_to_sc16.cpp
+++ b/host/lib/convert/sse2_sc16_to_sc16.cpp
@@ -25,20 +25,20 @@ using namespace uhd::convert;
 //      | C | D | A | B |   Output
 //      -----------------
 //
-#define CONVERT_SC16_1_TO_SC16_1_NSWAP_GUTS(_ialign_,_oalign_)          \
-    for (; i+3 < nsamps; i+=4) {                                        \
-        __m128i m0;                                                     \
-                                                                        \
-        /* load from input */                                           \
-        m0 = _mm_load ## _ialign_ ## si128((const __m128i *) (input+i));\
-                                                                        \
-        /* swap 16-bit pairs */                                         \
-        m0 = _mm_shufflelo_epi16(m0, _MM_SHUFFLE(2, 3, 0, 1));          \
-        m0 = _mm_shufflehi_epi16(m0, _MM_SHUFFLE(2, 3, 0, 1));          \
-                                                                        \
-        /* store to output */                                           \
-        _mm_store ## _oalign_ ## si128((__m128i *) (output+i), m0);     \
-    }                                                                   \
+#define CONVERT_SC16_1_TO_SC16_1_NSWAP_GUTS(_ialign_, _oalign_)      \
+    for (; i + 3 < nsamps; i += 4) {                                 \
+        __m128i m0;                                                  \
+                                                                     \
+        /* load from input */                                        \
+        m0 = _mm_load##_ialign_##si128((const __m128i*)(input + i)); \
+                                                                     \
+        /* swap 16-bit pairs */                                      \
+        m0 = _mm_shufflelo_epi16(m0, _MM_SHUFFLE(2, 3, 0, 1));       \
+        m0 = _mm_shufflehi_epi16(m0, _MM_SHUFFLE(2, 3, 0, 1));       \
+                                                                     \
+        /* store to output */                                        \
+        _mm_store##_oalign_##si128((__m128i*)(output + i), m0);      \
+    }

 //
 // SSE byte swap
@@ -54,138 +54,158 @@ using namespace uhd::convert;
 //      | B | A | D | C |   Output
 //      -----------------
 //
-#define CONVERT_SC16_1_TO_SC16_1_BSWAP_GUTS(_ialign_,_oalign_)          \
-    for (; i+3 < nsamps; i+=4) {                                        \
-        __m128i m0, m1, m2;                                             \
-                                                                        \
-        /* load from input */                                           \
-        m0 = _mm_load ## _ialign_ ## si128((const __m128i *) (input+i));\
-                                                                        \
-        /* byteswap 16 bit words */                                     \
-        m1 = _mm_srli_epi16(m0, 8);                                     \
-        m2 = _mm_slli_epi16(m0, 8);                                     \
-        m0 = _mm_or_si128(m1, m2);                                      \
-                                                                        \
-        /* store to output */                                           \
-        _mm_store ## _oalign_ ## si128((__m128i *) (output+i), m0);     \
-    }                                                                   \
-
-DECLARE_CONVERTER(sc16, 1, sc16_item32_le, 1, PRIORITY_SIMD){
-    const sc16_t *input = reinterpret_cast<const sc16_t *>(inputs[0]);
-    item32_t *output = reinterpret_cast<item32_t *>(outputs[0]);
+#define CONVERT_SC16_1_TO_SC16_1_BSWAP_GUTS(_ialign_, _oalign_)      \
+    for (; i + 3 < nsamps; i += 4) {                                 \
+        __m128i m0, m1, m2;                                          \
+                                                                     \
+        /* load from input */                                        \
+        m0 = _mm_load##_ialign_##si128((const __m128i*)(input + i)); \
+                                                                     \
+        /* byteswap 16 bit words */                                  \
+        m1 = _mm_srli_epi16(m0, 8);                                  \
+        m2 = _mm_slli_epi16(m0, 8);                                  \
+        m0 = _mm_or_si128(m1, m2);                                   \
+                                                                     \
+        /* store to output */                                        \
+        _mm_store##_oalign_##si128((__m128i*)(output + i), m0);      \
+    }
+
+DECLARE_CONVERTER(sc16, 1, sc16_item32_le, 1, PRIORITY_SIMD)
+{
+    const sc16_t* input = reinterpret_cast<const sc16_t*>(inputs[0]);
+    item32_t* output    = reinterpret_cast<item32_t*>(outputs[0]);

     size_t i = 0;

     // need to dispatch according to alignment for fastest conversion
-    switch (size_t(input) & 0xf){
-    case 0x0:
-        // the data is 16-byte aligned, so do the fast processing of the bulk of the samples
-        CONVERT_SC16_1_TO_SC16_1_NSWAP_GUTS(_,u_)
-        break;
-    case 0x8:
-        if (nsamps < 2)
+    switch (size_t(input) & 0xf) {
+        case 0x0:
+            // the data is 16-byte aligned, so do the fast processing of the bulk of the
+            // samples
+            CONVERT_SC16_1_TO_SC16_1_NSWAP_GUTS(_, u_)
+            break;
+        case 0x8:
+            if (nsamps < 2)
+                break;
+            // the first sample is 8-byte aligned - process it to align the remainder of
+            // the samples to 16-bytes
+            xx_to_item32_sc16<uhd::htowx>(input, output, 2, 1.0);
+            i += 2;
+            CONVERT_SC16_1_TO_SC16_1_NSWAP_GUTS(_, u_)
+            // do faster processing of the bulk of the samples now that we are 16-byte
+            // aligned
             break;
-        // the first sample is 8-byte aligned - process it to align the remainder of the samples to 16-bytes
-        xx_to_item32_sc16<uhd::htowx>(input, output, 2, 1.0);
-        i += 2;
-        CONVERT_SC16_1_TO_SC16_1_NSWAP_GUTS(_,u_)
-        // do faster processing of the bulk of the samples now that we are 16-byte aligned
-        break;
-    default:
-        // we are not 8 or 16-byte aligned, so do fast processing with the unaligned load
-        CONVERT_SC16_1_TO_SC16_1_NSWAP_GUTS(u_,u_)
+        default:
+            // we are not 8 or 16-byte aligned, so do fast processing with the unaligned
+            // load
+            CONVERT_SC16_1_TO_SC16_1_NSWAP_GUTS(u_, u_)
     }

     // convert any remaining samples
-    xx_to_item32_sc16<uhd::htowx>(input+i, output+i, nsamps-i, 1.0);
+    xx_to_item32_sc16<uhd::htowx>(input + i, output + i, nsamps - i, 1.0);
 }

-DECLARE_CONVERTER(sc16, 1, sc16_item32_be, 1, PRIORITY_SIMD){
-    const sc16_t *input = reinterpret_cast<const sc16_t *>(inputs[0]);
-    item32_t *output = reinterpret_cast<item32_t *>(outputs[0]);
+DECLARE_CONVERTER(sc16, 1, sc16_item32_be, 1, PRIORITY_SIMD)
+{
+    const sc16_t* input = reinterpret_cast<const sc16_t*>(inputs[0]);
+    item32_t* output    = reinterpret_cast<item32_t*>(outputs[0]);

     size_t i = 0;

     // need to dispatch according to alignment for fastest conversion
-    switch (size_t(input) & 0xf){
-    case 0x0:
-        // the data is 16-byte aligned, so do the fast processing of the bulk of the samples
-        CONVERT_SC16_1_TO_SC16_1_BSWAP_GUTS(_,u_)
-        break;
-    case 0x8:
-        if (nsamps < 2)
+    switch (size_t(input) & 0xf) {
+        case 0x0:
+            // the data is 16-byte aligned, so do the fast processing of the bulk of the
+            // samples
+            CONVERT_SC16_1_TO_SC16_1_BSWAP_GUTS(_, u_)
             break;
-        // the first value is 8-byte aligned - process it and prepare the bulk of the data for fast conversion
-        xx_to_item32_sc16<uhd::htonx>(input, output, 2, 1.0);
-        i += 2;
-        // do faster processing of the remaining samples now that we are 16-byte aligned
-        CONVERT_SC16_1_TO_SC16_1_BSWAP_GUTS(_,u_)
-        break;
-    default:
-        // we are not 8 or 16-byte aligned, so do fast processing with the unaligned load
-        CONVERT_SC16_1_TO_SC16_1_BSWAP_GUTS(u_,u_)
+        case 0x8:
+            if (nsamps < 2)
+                break;
+            // the first value is 8-byte aligned - process it and prepare the bulk of the
+            // data for fast conversion
+            xx_to_item32_sc16<uhd::htonx>(input, output, 2, 1.0);
+            i += 2;
+            // do faster processing of the remaining samples now that we are 16-byte
+            // aligned
+            CONVERT_SC16_1_TO_SC16_1_BSWAP_GUTS(_, u_)
+            break;
+        default:
+            // we are not 8 or 16-byte aligned, so do fast processing with the unaligned
+            // load
+            CONVERT_SC16_1_TO_SC16_1_BSWAP_GUTS(u_, u_)
     }

     // convert any remaining samples
-    xx_to_item32_sc16<uhd::htonx>(input+i, output+i, nsamps-i, 1.0);
+    xx_to_item32_sc16<uhd::htonx>(input + i, output + i, nsamps - i, 1.0);
 }

-DECLARE_CONVERTER(sc16_item32_le, 1, sc16, 1, PRIORITY_SIMD){
-    const item32_t *input = reinterpret_cast<const item32_t *>(inputs[0]);
-    sc16_t *output = reinterpret_cast<sc16_t *>(outputs[0]);
+DECLARE_CONVERTER(sc16_item32_le, 1, sc16, 1, PRIORITY_SIMD)
+{
+    const item32_t* input = reinterpret_cast<const item32_t*>(inputs[0]);
+    sc16_t* output        = reinterpret_cast<sc16_t*>(outputs[0]);

     size_t i = 0;

     // need to dispatch according to alignment for fastest conversion
-    switch (size_t(output) & 0xf){
-    case 0x0:
-        // the data is 16-byte aligned, so do the fast processing of the bulk of the samples
-        CONVERT_SC16_1_TO_SC16_1_NSWAP_GUTS(u_,_)
-        break;
-    case 0x8:
-        if (nsamps < 2)
+    switch (size_t(output) & 0xf) {
+        case 0x0:
+            // the data is 16-byte aligned, so do the fast processing of the bulk of the
+            // samples
+            CONVERT_SC16_1_TO_SC16_1_NSWAP_GUTS(u_, _)
+            break;
+        case 0x8:
+            if (nsamps < 2)
+                break;
+            // the first sample is 8-byte aligned - process it to align the remainder of
+            // the samples to 16-bytes
+            item32_sc16_to_xx<uhd::htowx>(input, output, 2, 1.0);
+            i += 2;
+            // do faster processing of the bulk of the samples now that we are 16-byte
+            // aligned
+            CONVERT_SC16_1_TO_SC16_1_NSWAP_GUTS(u_, _)
             break;
-        // the first sample is 8-byte aligned - process it to align the remainder of the samples to 16-bytes
-        item32_sc16_to_xx<uhd::htowx>(input, output, 2, 1.0);
-        i += 2;
-        // do faster processing of the bulk of the samples now that we are 16-byte aligned
-        CONVERT_SC16_1_TO_SC16_1_NSWAP_GUTS(u_,_)
-        break;
-    default:
-        // we are not 8 or 16-byte aligned, so do fast processing with the unaligned load and store
-        CONVERT_SC16_1_TO_SC16_1_NSWAP_GUTS(u_,u_)
+        default:
+            // we are not 8 or 16-byte aligned, so do fast processing with the unaligned
+            // load and store
+            CONVERT_SC16_1_TO_SC16_1_NSWAP_GUTS(u_, u_)
     }

     // convert any remaining samples
-    item32_sc16_to_xx<uhd::htowx>(input+i, output+i, nsamps-i, 1.0);
+    item32_sc16_to_xx<uhd::htowx>(input + i, output + i, nsamps - i, 1.0);
 }

-DECLARE_CONVERTER(sc16_item32_be, 1, sc16, 1, PRIORITY_SIMD){
-    const item32_t *input = reinterpret_cast<const item32_t *>(inputs[0]);
-    sc16_t *output = reinterpret_cast<sc16_t *>(outputs[0]);
+DECLARE_CONVERTER(sc16_item32_be, 1, sc16, 1, PRIORITY_SIMD)
+{
+    const item32_t* input = reinterpret_cast<const item32_t*>(inputs[0]);
+    sc16_t* output        = reinterpret_cast<sc16_t*>(outputs[0]);

     size_t i = 0;

     // need to dispatch according to alignment for fastest conversion
-    switch (size_t(output) & 0xf){
-    case 0x0:
-        // the data is 16-byte aligned, so do the fast processing of the bulk of the samples
-        CONVERT_SC16_1_TO_SC16_1_BSWAP_GUTS(u_,_)
-        break;
-    case 0x8:
-        if (nsamps < 2)
+    switch (size_t(output) & 0xf) {
+        case 0x0:
+            // the data is 16-byte aligned, so do the fast processing of the bulk of the
+            // samples
+            CONVERT_SC16_1_TO_SC16_1_BSWAP_GUTS(u_, _)
+            break;
+        case 0x8:
+            if (nsamps < 2)
+                break;
+            // the first sample is 8-byte aligned - process it to align the remainder of
+            // the samples to 16-bytes
+            item32_sc16_to_xx<uhd::htonx>(input, output, 2, 1.0);
+            i += 2;
+            // do faster processing of the bulk of the samples now that we are 16-byte
+            // aligned
+            CONVERT_SC16_1_TO_SC16_1_BSWAP_GUTS(u_, _)
             break;
-        // the first sample is 8-byte aligned - process it to align the remainder of the samples to 16-bytes
-        item32_sc16_to_xx<uhd::htonx>(input, output, 2, 1.0);
-        i += 2;
-        // do faster processing of the bulk of the samples now that we are 16-byte aligned
-        CONVERT_SC16_1_TO_SC16_1_BSWAP_GUTS(u_,_)
-        break;
-    default:
-        // we are not 8 or 16-byte aligned, so do fast processing with the unaligned load and store
-        CONVERT_SC16_1_TO_SC16_1_BSWAP_GUTS(u_,u_)
+        default:
+            // we are not 8 or 16-byte aligned, so do fast processing with the unaligned
+            // load and store
+            CONVERT_SC16_1_TO_SC16_1_BSWAP_GUTS(u_, u_)
     }

     // convert any remaining samples
-    item32_sc16_to_xx<uhd::htonx>(input+i, output+i, nsamps-i, 1.0);
+    item32_sc16_to_xx<uhd::htonx>(input + i, output + i, nsamps - i, 1.0);
 }
diff --git a/host/lib/convert/sse2_sc8_to_fc32.cpp b/host/lib/convert/sse2_sc8_to_fc32.cpp
index 6d68850bf..aefda2b13 100644
--- a/host/lib/convert/sse2_sc8_to_fc32.cpp
+++ b/host/lib/convert/sse2_sc8_to_fc32.cpp
@@ -14,109 +14,111 @@ using namespace uhd::convert;

 static const __m128i zeroi = _mm_setzero_si128();

 template <const int shuf>
-UHD_INLINE void unpack_sc32_4x(
-    const __m128i &in,
-    __m128 &out0, __m128 &out1,
-    __m128 &out2, __m128 &out3,
-    const __m128 &scalar
-){
+UHD_INLINE void unpack_sc32_4x(const __m128i& in,
+    __m128& out0,
+    __m128& out1,
+    __m128& out2,
+    __m128& out3,
+    const __m128& scalar)
+{
     const __m128i tmplo = _mm_unpacklo_epi8(zeroi, in); /* value in upper 8 bits */
-    __m128i tmp0 = _mm_shuffle_epi32(_mm_unpacklo_epi16(zeroi, tmplo), shuf); /* value in upper 16 bits */
+    __m128i tmp0        = _mm_shuffle_epi32(
+        _mm_unpacklo_epi16(zeroi, tmplo), shuf); /* value in upper 16 bits */
     __m128i tmp1 = _mm_shuffle_epi32(_mm_unpackhi_epi16(zeroi, tmplo), shuf);
-    out0 = _mm_mul_ps(_mm_cvtepi32_ps(tmp0), scalar);
-    out1 = _mm_mul_ps(_mm_cvtepi32_ps(tmp1), scalar);
+    out0         = _mm_mul_ps(_mm_cvtepi32_ps(tmp0), scalar);
+    out1         = _mm_mul_ps(_mm_cvtepi32_ps(tmp1), scalar);

     const __m128i tmphi = _mm_unpackhi_epi8(zeroi, in);
-    __m128i tmp2 = _mm_shuffle_epi32(_mm_unpacklo_epi16(zeroi, tmphi), shuf);
-    __m128i tmp3 = _mm_shuffle_epi32(_mm_unpackhi_epi16(zeroi, tmphi), shuf);
-    out2 = _mm_mul_ps(_mm_cvtepi32_ps(tmp2), scalar);
-    out3 = _mm_mul_ps(_mm_cvtepi32_ps(tmp3), scalar);
+    __m128i tmp2        = _mm_shuffle_epi32(_mm_unpacklo_epi16(zeroi, tmphi), shuf);
+    __m128i tmp3        = _mm_shuffle_epi32(_mm_unpackhi_epi16(zeroi, tmphi), shuf);
+    out2                = _mm_mul_ps(_mm_cvtepi32_ps(tmp2), scalar);
+    out3                = _mm_mul_ps(_mm_cvtepi32_ps(tmp3), scalar);
 }

-DECLARE_CONVERTER(sc8_item32_be, 1, fc32, 1, PRIORITY_SIMD){
-    const item32_t *input = reinterpret_cast<const item32_t *>(size_t(inputs[0]) & ~0x3);
-    fc32_t *output = reinterpret_cast<fc32_t *>(outputs[0]);
+DECLARE_CONVERTER(sc8_item32_be, 1, fc32, 1, PRIORITY_SIMD)
+{
+    const item32_t* input = reinterpret_cast<const item32_t*>(size_t(inputs[0]) & ~0x3);
+    fc32_t* output        = reinterpret_cast<fc32_t*>(outputs[0]);

-    const __m128 scalar = _mm_set_ps1(float(scale_factor)/(1 << 24));
-    const int shuf = _MM_SHUFFLE(3, 2, 1, 0);
+    const __m128 scalar = _mm_set_ps1(float(scale_factor) / (1 << 24));
+    const int shuf      = _MM_SHUFFLE(3, 2, 1, 0);

     size_t i = 0, j = 0;
     fc32_t dummy;
     size_t num_samps = nsamps;

-    if ((size_t(inputs[0]) & 0x3) != 0){
+    if ((size_t(inputs[0]) & 0x3) != 0) {
         item32_sc8_to_xx<uhd::ntohx>(input++, output++, 1, scale_factor);
         num_samps--;
     }

-    #define convert_sc8_item32_1_to_fc32_1_bswap_guts(_al_)             \
-    for (; j+7 < num_samps; j+=8, i+=4){                                \
-        /* load from input */                                           \
-        __m128i tmpi = _mm_loadu_si128(reinterpret_cast<const __m128i *>(input+i)); \
-                                                                        \
-        /* unpack + swap 8-bit pairs */                                 \
-        __m128 tmp0, tmp1, tmp2, tmp3;                                  \
-        unpack_sc32_4x<shuf>(tmpi, tmp0, tmp1, tmp2, tmp3, scalar); \
-                                                                        \
-        /* store to output */                                           \
-        _mm_store ## _al_ ## ps(reinterpret_cast<float *>(output+j+0), tmp0); \
-        _mm_store ## _al_ ## ps(reinterpret_cast<float *>(output+j+2), tmp1); \
-        _mm_store ## _al_ ## ps(reinterpret_cast<float *>(output+j+4), tmp2); \
-        _mm_store ## _al_ ## ps(reinterpret_cast<float *>(output+j+6), tmp3); \
+#define convert_sc8_item32_1_to_fc32_1_bswap_guts(_al_)                              \
+    for (; j + 7 < num_samps; j += 8, i += 4) {                                      \
+        /* load from input */                                                        \
+        __m128i tmpi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(input + i)); \
+                                                                                     \
+        /* unpack + swap 8-bit pairs */                                              \
+        __m128 tmp0, tmp1, tmp2, tmp3;                                               \
+        unpack_sc32_4x<shuf>(tmpi, tmp0, tmp1, tmp2, tmp3, scalar);                  \
+                                                                                     \
+        /* store to output */                                                        \
+        _mm_store##_al_##ps(reinterpret_cast<float*>(output + j + 0), tmp0);         \
+        _mm_store##_al_##ps(reinterpret_cast<float*>(output + j + 2), tmp1);         \
+        _mm_store##_al_##ps(reinterpret_cast<float*>(output + j + 4), tmp2);         \
+        _mm_store##_al_##ps(reinterpret_cast<float*>(output + j + 6), tmp3);         \
     }

-    //dispatch according to alignment
-    if ((size_t(output) & 0xf) == 0){
+    // dispatch according to alignment
+    if ((size_t(output) & 0xf) == 0) {
         convert_sc8_item32_1_to_fc32_1_bswap_guts(_)
-    }
-    else{
+    } else {
         convert_sc8_item32_1_to_fc32_1_bswap_guts(u_)
     }

-    //convert remainder
-    item32_sc8_to_xx<uhd::ntohx>(input+i, output+j, num_samps-j, scale_factor);
+    // convert remainder
+    item32_sc8_to_xx<uhd::ntohx>(input + i, output + j, num_samps - j, scale_factor);
 }

-DECLARE_CONVERTER(sc8_item32_le, 1, fc32, 1, PRIORITY_SIMD){
-    const item32_t *input = reinterpret_cast<const item32_t *>(size_t(inputs[0]) & ~0x3);
-    fc32_t *output = reinterpret_cast<fc32_t *>(outputs[0]);
+DECLARE_CONVERTER(sc8_item32_le, 1, fc32, 1, PRIORITY_SIMD)
+{
+    const item32_t* input = reinterpret_cast<const item32_t*>(size_t(inputs[0]) & ~0x3);
+    fc32_t* output        = reinterpret_cast<fc32_t*>(outputs[0]);

-    const __m128 scalar = _mm_set_ps1(float(scale_factor)/(1 << 24));
-    const int shuf = _MM_SHUFFLE(0, 1, 2, 3);
+    const __m128 scalar = _mm_set_ps1(float(scale_factor) / (1 << 24));
+    const int shuf      = _MM_SHUFFLE(0, 1, 2, 3);

     size_t i = 0, j = 0;
     fc32_t dummy;
     size_t num_samps = nsamps;

-    if ((size_t(inputs[0]) & 0x3) != 0){
+    if ((size_t(inputs[0]) & 0x3) != 0) {
         item32_sc8_to_xx<uhd::wtohx>(input++, output++, 1, scale_factor);
         num_samps--;
     }

-    #define convert_sc8_item32_1_to_fc32_1_nswap_guts(_al_)             \
-    for (; j+7 < num_samps; j+=8, i+=4){                                \
-        /* load from input */                                           \
-        __m128i tmpi = _mm_loadu_si128(reinterpret_cast<const __m128i *>(input+i)); \
-                                                                        \
-        /* unpack + swap 8-bit pairs */                                 \
-        __m128 tmp0, tmp1, tmp2, tmp3;                                  \
-        unpack_sc32_4x<shuf>(tmpi, tmp0, tmp1, tmp2, tmp3, scalar); \
-                                                                        \
-        /* store to output */                                           \
-        _mm_store ## _al_ ## ps(reinterpret_cast<float *>(output+j+0), tmp0); \
-        _mm_store ## _al_ ## ps(reinterpret_cast<float *>(output+j+2), tmp1); \
-        _mm_store ## _al_ ## ps(reinterpret_cast<float *>(output+j+4), tmp2); \
-        _mm_store ## _al_ ## ps(reinterpret_cast<float *>(output+j+6), tmp3); \
+#define convert_sc8_item32_1_to_fc32_1_nswap_guts(_al_)                              \
+    for (; j + 7 < num_samps; j += 8, i += 4) {                                      \
+        /* load from input */                                                        \
+        __m128i tmpi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(input + i)); \
+                                                                                     \
+        /* unpack + swap 8-bit pairs */                                              \
+        __m128 tmp0, tmp1, tmp2, tmp3;                                               \
+        unpack_sc32_4x<shuf>(tmpi, tmp0, tmp1, tmp2, tmp3, scalar);                  \
+                                                                                     \
+        /* store to output */                                                        \
+        _mm_store##_al_##ps(reinterpret_cast<float*>(output + j + 0), tmp0);         \
+        _mm_store##_al_##ps(reinterpret_cast<float*>(output + j + 2), tmp1);         \
+        _mm_store##_al_##ps(reinterpret_cast<float*>(output + j + 4), tmp2);         \
+        _mm_store##_al_##ps(reinterpret_cast<float*>(output + j + 6), tmp3);         \
     }

-    //dispatch according to alignment
-    if ((size_t(output) & 0xf) == 0){
+    // dispatch according to alignment
+    if ((size_t(output) & 0xf) == 0) {
         convert_sc8_item32_1_to_fc32_1_nswap_guts(_)
-    }
-    else{
+    } else {
         convert_sc8_item32_1_to_fc32_1_nswap_guts(u_)
     }

-    //convert remainder
-    item32_sc8_to_xx<uhd::wtohx>(input+i, output+j, num_samps-j, scale_factor);
+    // convert remainder
+    item32_sc8_to_xx<uhd::wtohx>(input + i, output + j, num_samps - j, scale_factor);
 }
diff --git a/host/lib/convert/sse2_sc8_to_fc64.cpp b/host/lib/convert/sse2_sc8_to_fc64.cpp
index f5b406152..3cc2fefd0 100644
--- a/host/lib/convert/sse2_sc8_to_fc64.cpp
+++ b/host/lib/convert/sse2_sc8_to_fc64.cpp
@@ -13,129 +13,133 @@ using namespace uhd::convert;

 static const __m128i zeroi = _mm_setzero_si128();

-UHD_INLINE void unpack_sc32_8x(
-    const __m128i &in,
-    __m128d &out0, __m128d &out1,
-    __m128d &out2, __m128d &out3,
-    __m128d &out4, __m128d &out5,
-    __m128d &out6, __m128d &out7,
-    const __m128d &scalar
-){
+UHD_INLINE void unpack_sc32_8x(const __m128i& in,
+    __m128d& out0,
+    __m128d& out1,
+    __m128d& out2,
+    __m128d& out3,
+    __m128d& out4,
+    __m128d& out5,
+    __m128d& out6,
+    __m128d& out7,
+    const __m128d& scalar)
+{
     const int shuf = _MM_SHUFFLE(1, 0, 3, 2);
     __m128i tmp;

     const __m128i tmplo = _mm_unpacklo_epi8(zeroi, in); /* value in upper 8 bits */
-    tmp = _mm_unpacklo_epi16(zeroi, tmplo); /* value in upper 16 bits */
-    out0 = _mm_mul_pd(_mm_cvtepi32_pd(tmp), scalar);
-    tmp = _mm_shuffle_epi32(tmp, shuf);
-    out1 = _mm_mul_pd(_mm_cvtepi32_pd(tmp), scalar);
-    tmp = _mm_unpackhi_epi16(zeroi, tmplo);
-    out2 = _mm_mul_pd(_mm_cvtepi32_pd(tmp), scalar);
-    tmp = _mm_shuffle_epi32(tmp, shuf);
-    out3 = _mm_mul_pd(_mm_cvtepi32_pd(tmp), scalar);
+    tmp                 = _mm_unpacklo_epi16(zeroi, tmplo); /* value in upper 16 bits */
+    out0                = _mm_mul_pd(_mm_cvtepi32_pd(tmp), scalar);
+    tmp                 = _mm_shuffle_epi32(tmp, shuf);
+    out1                = _mm_mul_pd(_mm_cvtepi32_pd(tmp), scalar);
+    tmp                 = _mm_unpackhi_epi16(zeroi, tmplo);
+    out2                = _mm_mul_pd(_mm_cvtepi32_pd(tmp), scalar);
+    tmp                 = _mm_shuffle_epi32(tmp, shuf);
+    out3                = _mm_mul_pd(_mm_cvtepi32_pd(tmp), scalar);

     const __m128i tmphi = _mm_unpackhi_epi8(zeroi, in);
-    tmp = _mm_unpacklo_epi16(zeroi, tmphi);
-    out4 = _mm_mul_pd(_mm_cvtepi32_pd(tmp), scalar);
-    tmp = _mm_shuffle_epi32(tmp, shuf);
-    out5 = _mm_mul_pd(_mm_cvtepi32_pd(tmp), scalar);
-    tmp = _mm_unpackhi_epi16(zeroi, tmphi);
-    out6 = _mm_mul_pd(_mm_cvtepi32_pd(tmp), scalar);
-    tmp = _mm_shuffle_epi32(tmp, shuf);
-    out7 = _mm_mul_pd(_mm_cvtepi32_pd(tmp), scalar);
+    tmp                 = _mm_unpacklo_epi16(zeroi, tmphi);
+    out4                = _mm_mul_pd(_mm_cvtepi32_pd(tmp), scalar);
+    tmp                 = _mm_shuffle_epi32(tmp, shuf);
+    out5                = _mm_mul_pd(_mm_cvtepi32_pd(tmp), scalar);
+    tmp                 = _mm_unpackhi_epi16(zeroi, tmphi);
+    out6                = _mm_mul_pd(_mm_cvtepi32_pd(tmp), scalar);
+    tmp                 = _mm_shuffle_epi32(tmp, shuf);
+    out7                = _mm_mul_pd(_mm_cvtepi32_pd(tmp), scalar);
 }

-DECLARE_CONVERTER(sc8_item32_be, 1, fc64, 1, PRIORITY_SIMD){
-    const item32_t *input = reinterpret_cast<const item32_t *>(size_t(inputs[0]) & ~0x3);
-    fc64_t *output = reinterpret_cast<fc64_t *>(outputs[0]);
+DECLARE_CONVERTER(sc8_item32_be, 1, fc64, 1, PRIORITY_SIMD)
+{
+    const item32_t* input = reinterpret_cast<const item32_t*>(size_t(inputs[0]) & ~0x3);
+    fc64_t* output        = reinterpret_cast<fc64_t*>(outputs[0]);

-    const __m128d scalar = _mm_set1_pd(scale_factor/(1 << 24));
+    const __m128d scalar = _mm_set1_pd(scale_factor / (1 << 24));

     size_t i = 0, j = 0;
     fc32_t dummy;
     size_t num_samps = nsamps;

-    if ((size_t(inputs[0]) & 0x3) != 0){
+    if ((size_t(inputs[0]) & 0x3) != 0) {
         item32_sc8_to_xx<uhd::ntohx>(input++, output++, 1, scale_factor);
         num_samps--;
     }

-    #define convert_sc8_item32_1_to_fc64_1_bswap_guts(_al_)             \
-    for (; j+7 < num_samps; j+=8, i+=4){                                \
-        /* load from input */                                           \
-        __m128i tmpi = _mm_loadu_si128(reinterpret_cast<const __m128i *>(input+i)); \
-                                                                        \
-        /* unpack */                                                    \
-        __m128d tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;         \
+#define convert_sc8_item32_1_to_fc64_1_bswap_guts(_al_)                               \
+    for (; j + 7 < num_samps; j += 8, i += 4) {                                       \
+        /* load from input */                                                         \
+        __m128i tmpi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(input + i));  \
+                                                                                      \
+        /* unpack */                                                                  \
+        __m128d tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;                       \
         unpack_sc32_8x(tmpi, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, scalar); \
-                                                                        \
-        /* store to output */                                           \
-        _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+0), tmp0); \
-        _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+1), tmp1); \
-        _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+2), tmp2); \
-        _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+3), tmp3); \
-        _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+4), tmp4); \
-        _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+5), tmp5); \
-        _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+6), tmp6); \
-        _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+7), tmp7); \
+                                                                                      \
+        /* store to output */                                                         \
+        _mm_store##_al_##pd(reinterpret_cast<double*>(output + j + 0), tmp0);         \
+        _mm_store##_al_##pd(reinterpret_cast<double*>(output + j + 1), tmp1);         \
+        _mm_store##_al_##pd(reinterpret_cast<double*>(output + j + 2), tmp2);         \
+        _mm_store##_al_##pd(reinterpret_cast<double*>(output + j + 3), tmp3);         \
+        _mm_store##_al_##pd(reinterpret_cast<double*>(output + j + 4), tmp4);         \
+        _mm_store##_al_##pd(reinterpret_cast<double*>(output + j + 5), tmp5);         \
+        _mm_store##_al_##pd(reinterpret_cast<double*>(output + j + 6), tmp6);         \
+        _mm_store##_al_##pd(reinterpret_cast<double*>(output + j + 7), tmp7);         \
     }

-    //dispatch according to alignment
-    if ((size_t(output) & 0xf) == 0){
+    // dispatch according to alignment
+    if ((size_t(output) & 0xf) == 0) {
         convert_sc8_item32_1_to_fc64_1_bswap_guts(_)
-    }
-    else{
+    } else {
         convert_sc8_item32_1_to_fc64_1_bswap_guts(u_)
     }

-    //convert remainder
-    item32_sc8_to_xx<uhd::ntohx>(input+i, output+j, num_samps-j, scale_factor);
+    // convert remainder
+    item32_sc8_to_xx<uhd::ntohx>(input + i, output + j, num_samps - j, scale_factor);
 }

-DECLARE_CONVERTER(sc8_item32_le, 1, fc64, 1, PRIORITY_SIMD){
-    const item32_t *input = reinterpret_cast<const item32_t *>(size_t(inputs[0]) & ~0x3);
-    fc64_t *output = reinterpret_cast<fc64_t *>(outputs[0]);
+DECLARE_CONVERTER(sc8_item32_le, 1, fc64, 1, PRIORITY_SIMD)
+{
+    const item32_t* input = reinterpret_cast<const item32_t*>(size_t(inputs[0]) & ~0x3);
+    fc64_t* output        = reinterpret_cast<fc64_t*>(outputs[0]);

-    const __m128d scalar = _mm_set1_pd(scale_factor/(1 << 24));
+    const __m128d scalar = _mm_set1_pd(scale_factor / (1 << 24));

     size_t i = 0, j = 0;
     fc32_t dummy;
     size_t num_samps = nsamps;

-    if ((size_t(inputs[0]) & 0x3) != 0){
+    if ((size_t(inputs[0]) & 0x3) != 0) {
         item32_sc8_to_xx<uhd::wtohx>(input++, output++, 1, scale_factor);
         num_samps--;
     }

-    #define convert_sc8_item32_1_to_fc64_1_nswap_guts(_al_)             \
-    for (; j+7 < num_samps; j+=8, i+=4){                                \
-        /* load from input */                                           \
-        __m128i tmpi = _mm_loadu_si128(reinterpret_cast<const __m128i *>(input+i)); \
-                                                                        \
-        /* unpack */                                                    \
-        __m128d tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;         \
-        tmpi = _mm_or_si128(_mm_srli_epi16(tmpi, 8), _mm_slli_epi16(tmpi, 8)); /*byteswap*/\
-        unpack_sc32_8x(tmpi, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, scalar); \
-                                                                        \
-        /* store to output */                                           \
-        _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+0), tmp0); \
-        _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+1), tmp1); \
-        _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+2), tmp2); \
-        _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+3), tmp3); \
-        _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+4), tmp4); \
-        _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+5), tmp5); \
-        _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+6), tmp6); \
-        _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+7), tmp7); \
+#define convert_sc8_item32_1_to_fc64_1_nswap_guts(_al_)                                  \
+    for (; j + 7 < num_samps; j += 8, i += 4) {                                          \
+        /* load from input */                                                            \
+        __m128i tmpi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(input + i));     \
+                                                                                         \
+        /* unpack */                                                                     \
+        __m128d tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;                          \
+        tmpi =                                                                           \
+            _mm_or_si128(_mm_srli_epi16(tmpi, 8), _mm_slli_epi16(tmpi, 8)); /*byteswap*/ \
+        unpack_sc32_8x(tmpi, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, scalar);    \
+                                                                                         \
+        /* store to output */                                                            \
+        _mm_store##_al_##pd(reinterpret_cast<double*>(output + j + 0), tmp0);            \
+        _mm_store##_al_##pd(reinterpret_cast<double*>(output + j + 1), tmp1);            \
+        _mm_store##_al_##pd(reinterpret_cast<double*>(output + j + 2), tmp2);            \
+        _mm_store##_al_##pd(reinterpret_cast<double*>(output + j + 3), tmp3);            \
+        _mm_store##_al_##pd(reinterpret_cast<double*>(output + j + 4), tmp4);            \
+        _mm_store##_al_##pd(reinterpret_cast<double*>(output + j + 5), tmp5);            \
+        _mm_store##_al_##pd(reinterpret_cast<double*>(output + j + 6), tmp6);            \
+        _mm_store##_al_##pd(reinterpret_cast<double*>(output + j + 7), tmp7);            \
    }

-    //dispatch according to alignment
-    if ((size_t(output) & 0xf) == 0){
+    // dispatch according to alignment
+    if ((size_t(output) & 0xf) == 0) {
        convert_sc8_item32_1_to_fc64_1_nswap_guts(_)
-    }
-    else{
+    } else {
        convert_sc8_item32_1_to_fc64_1_nswap_guts(u_)
    }

-    //convert remainder
-    item32_sc8_to_xx<uhd::wtohx>(input+i, output+j, num_samps-j, scale_factor);
+    // convert remainder
+    item32_sc8_to_xx<uhd::wtohx>(input + i, output + j, num_samps - j, scale_factor);
 }
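
All of these converters share the same dispatch skeleton: test the address of the 16-byte-wide operand modulo 16, peel off just enough samples through the scalar item32 helpers to reach alignment, run the SIMD bulk loop, then hand the tail back to the scalar helpers. The following sketch is not part of the patch; kernel() and wordswap() are illustrative names, and it assumes complex<int16_t> I/Q samples:

#include <emmintrin.h>
#include <complex>
#include <cstddef>
#include <cstdint>

using sc16 = std::complex<int16_t>;

// bulk loop: 4 complex samples (16 bytes) per iteration
template <bool aligned>
static void kernel(const sc16* in, sc16* out, size_t nsamps, size_t& i)
{
    for (; i + 3 < nsamps; i += 4) {
        __m128i m = aligned ? _mm_load_si128(reinterpret_cast<const __m128i*>(in + i))
                            : _mm_loadu_si128(reinterpret_cast<const __m128i*>(in + i));
        // swap each 16-bit I/Q pair, like the NSWAP guts above
        m = _mm_shufflelo_epi16(m, _MM_SHUFFLE(2, 3, 0, 1));
        m = _mm_shufflehi_epi16(m, _MM_SHUFFLE(2, 3, 0, 1));
        _mm_storeu_si128(reinterpret_cast<__m128i*>(out + i), m);
    }
}

void wordswap(const sc16* in, sc16* out, size_t nsamps)
{
    size_t i = 0;
    switch (size_t(in) & 0xf) {
        case 0x0: // already 16-byte aligned: aligned loads for the whole bulk
            kernel<true>(in, out, nsamps, i);
            break;
        case 0x8: // 8-byte aligned: peel two 4-byte samples, then aligned loads
            if (nsamps < 2)
                break;
            out[0] = sc16(in[0].imag(), in[0].real());
            out[1] = sc16(in[1].imag(), in[1].real());
            i = 2;
            kernel<true>(in, out, nsamps, i);
            break;
        default: // odd alignment: unaligned loads still beat the scalar path
            kernel<false>(in, out, nsamps, i);
    }
    for (; i < nsamps; i++) // scalar tail, standing in for item32_sc16_to_xx
        out[i] = sc16(in[i].imag(), in[i].real());
}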
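
The _bswap_ variants all lean on the same endian trick: SSE2 has no 16-bit byte-swap instruction, so each 16-bit lane is rotated through two shifts and an OR. A minimal, self-contained illustration (the function name is ours, not UHD's):

#include <emmintrin.h>
#include <cassert>
#include <cstdint>

// swap the two bytes of every 16-bit lane: 0xAABB -> 0xBBAA
static inline __m128i bswap16_lanes(__m128i x)
{
    // high byte moves down, low byte moves up, OR recombines them
    return _mm_or_si128(_mm_srli_epi16(x, 8), _mm_slli_epi16(x, 8));
}

int main()
{
    alignas(16) uint16_t v[8] = {0xAABB, 0x1234, 0, 0, 0, 0, 0, 0};
    __m128i r = bswap16_lanes(_mm_load_si128(reinterpret_cast<const __m128i*>(v)));
    _mm_store_si128(reinterpret_cast<__m128i*>(v), r);
    assert(v[0] == 0xBBAA && v[1] == 0x3412);
    return 0;
}

The _nswap_ variants need a 16-bit word swap instead, which is what the paired _mm_shufflelo_epi16/_mm_shufflehi_epi16 calls with _MM_SHUFFLE(2, 3, 0, 1) accomplish.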
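
The scale factors divided by (1 << 16) (and by (1 << 24) on the sc8 paths) are not arbitrary. SSE2 has no packed sign-extension (_mm_cvtepi16_epi32 only arrived with SSE4.1), so the converters interleave zeros below each sample with _mm_unpacklo_epi16(zeroi, tmpi): the signed value lands in the upper half of a 32-bit lane, i.e. it is multiplied by 2^16 (2^24 for 8-bit samples after two unpack stages) with its sign bit intact, and the surplus factor is folded back into the scalar. A reduced sketch for four sc16 values (the helper name is illustrative):

#include <emmintrin.h>
#include <cstdint>

// convert 4 int16 samples to float, scaled by scale_factor
void sc16_to_f32_x4(const int16_t* in, float* out, double scale_factor)
{
    // pre-divide by 2^16 to cancel the shift introduced by the unpack below
    const __m128 scalar = _mm_set_ps1(float(scale_factor / (1 << 16)));
    const __m128i zeroi = _mm_setzero_si128();

    __m128i tmpi  = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(in)); // 4 x int16
    // zeros fill the low half of each 32-bit lane: lane = sample * 2^16, sign intact
    __m128i lanes = _mm_unpacklo_epi16(zeroi, tmpi);
    _mm_storeu_ps(out, _mm_mul_ps(_mm_cvtepi32_ps(lanes), scalar));
}

With scale_factor = 1.0 / 32768 the results land in the usual [-1.0, 1.0) fc32 range.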
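
One further subtlety in the sc8 converters above: inputs[0] may point into the middle of a 32-bit item, so the pointer is rounded down to the containing item32 with & ~0x3 and a single sample is drained through the scalar helper before the vector loop starts. A toy demonstration of just the rounding (buffer and names are ours):

#include <cstdint>
#include <cstdio>

int main()
{
    // pretend the client buffer starts 2 bytes into a 32-bit item
    alignas(4) unsigned char buf[8] = {};
    uintptr_t addr = reinterpret_cast<uintptr_t>(buf) + 2;
    uintptr_t word = addr & ~uintptr_t(0x3); // round down to the item32 boundary
    std::printf("offset into item32: %u bytes\n", unsigned(addr - word)); // prints 2
    return 0;
}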
