Accelerating the unaccelerable, or getting acquainted with SIMD, part 2 - AVX

    The previous part sparked a heated discussion, during which it turned out that AVX/AVX2 actually is available in desktop CPUs, not just AVX-512. So we continue getting acquainted with SIMD, this time with its more modern part, AVX, and also look into some of the comments:


    • Is _mm256_load_si256 slower than direct memory access?
    • Does using AVX instructions on SSE registers affect speed?
    • Is it really that bad to use _popcnt?

    A little about AVX


    AVX/AVX2 is a more powerful version of SSE that extends most 128-bit SSE operations to 256 bits and adds a number of new instructions.


    Among the implementation details, it is worth noting that at the assembly level AVX instructions take three operands, which makes it possible to leave the data in the first two intact; SSE stores the result in one of its operands.


    It also has to be taken into account that with direct addressing the data must be aligned to 32 bytes, whereas SSE requires 16-byte alignment.


    An updated version of the benchmark


    Changes:


    1. The number of elements has been increased 10,000-fold (to 10,240,000, i.e. about 20 MB of int16_t), so that the array is guaranteed not to fit into the processor's cache.
    2. Alignment changed from 16 bytes to 32 to support AVX.
    3. AVX implementations analogous to the SSE ones have been added.

    Benchmark code
    #include <benchmark/benchmark.h>
    #include <x86intrin.h>
    #include <cstdlib>
    #include <cstring>
    #define ARR_SIZE 10240000
    #define VAL 50
    static int16_t *getRandArr() {
        auto res = new int16_t[ARR_SIZE];
        for (int i = 0; i < ARR_SIZE; ++i) {
            res[i] = static_cast<int16_t>(rand() % (VAL * 2));
        }
        return res;
    }
    static auto arr = getRandArr();
    static int16_t *getAllignedArr() {
        auto res = aligned_alloc(32, sizeof(int16_t) * ARR_SIZE);
        memcpy(res, arr, sizeof(int16_t) * ARR_SIZE);
        return static_cast<int16_t *>(res);
    }
    static auto allignedArr = getAllignedArr();
    static void BM_Count(benchmark::State &state) {
        for (auto _ : state) {
            int64_t cnt = 0;
            for (int i = 0; i < ARR_SIZE; ++i)
                if (arr[i] == VAL)
                    ++cnt;
            benchmark::DoNotOptimize(cnt);
        }
    }
    BENCHMARK(BM_Count);
    static void BM_SSE_COUNT_SET_EPI(benchmark::State &state) {
        for (auto _ : state) {
            int64_t cnt = 0;
            auto sseVal = _mm_set1_epi16(VAL);
            for (int i = 0; i < ARR_SIZE; i += 8) {
                cnt += _popcnt32(
                        _mm_movemask_epi8(
                                _mm_cmpeq_epi16(
                                        sseVal,
                                        _mm_set_epi16(arr[i + 7], arr[i + 6], arr[i + 5], arr[i + 4],
                                                      arr[i + 3], arr[i + 2], arr[i + 1], arr[i])
                                )
                        )
                );
            }
            benchmark::DoNotOptimize(cnt >> 1); // _mm_movemask_epi8 sets 2 bits per matching 16-bit element, so popcnt double-counts; >> 1 corrects the total
        }
    }
    BENCHMARK(BM_SSE_COUNT_SET_EPI);
    static void BM_SSE_COUNT_LOADU(benchmark::State &state) {
        for (auto _ : state) {
            int64_t cnt = 0;
            auto sseVal = _mm_set1_epi16(VAL);
            for (int i = 0; i < ARR_SIZE; i += 8) {
                cnt += _popcnt32(
                        _mm_movemask_epi8(
                                _mm_cmpeq_epi16(
                                        sseVal,
                                        _mm_loadu_si128((__m128i *) &arr[i])
                                )
                        )
                );
            }
            benchmark::DoNotOptimize(cnt >> 1);
        }
    }
    BENCHMARK(BM_SSE_COUNT_LOADU);
    static void BM_SSE_COUNT_DIRECT(benchmark::State &state) {
        for (auto _ : state) {
            int64_t cnt = 0;
            auto sseVal = _mm_set1_epi16(VAL);
            for (int i = 0; i < ARR_SIZE; i += 8) {
                cnt += _popcnt32(
                        _mm_movemask_epi8(
                                _mm_cmpeq_epi16(
                                        sseVal,
                                        *(__m128i *) &allignedArr[i]
                                )
                        )
                );
            }
            benchmark::DoNotOptimize(cnt >> 1);
        }
    }
    BENCHMARK(BM_SSE_COUNT_DIRECT);
    #ifdef __AVX2__
    static void BM_AVX_COUNT_LOADU(benchmark::State &state) {
        for (auto _ : state) {
            int64_t cnt = 0;
            auto avxVal = _mm256_set1_epi16(VAL);
            for (int i = 0; i < ARR_SIZE; i += 16) {
                cnt += _popcnt32(
                        _mm256_movemask_epi8(
                                _mm256_cmpeq_epi16(
                                        avxVal,
                                        _mm256_loadu_si256((__m256i *) &arr[i])
                                )
                        )
                );
            }
            benchmark::DoNotOptimize(cnt >> 1);
        }
    }
    BENCHMARK(BM_AVX_COUNT_LOADU);
    static void BM_AVX_COUNT_LOAD(benchmark::State &state) {
        for (auto _ : state) {
            int64_t cnt = 0;
            auto avxVal = _mm256_set1_epi16(VAL);
            for (int i = 0; i < ARR_SIZE; i += 16) {
                cnt += _popcnt32(
                        _mm256_movemask_epi8(
                                _mm256_cmpeq_epi16(avxVal,
                                                   _mm256_load_si256((__m256i *) &allignedArr[i])
                                )
                        )
                );
            }
            benchmark::DoNotOptimize(cnt >> 1);
        }
    }
    BENCHMARK(BM_AVX_COUNT_LOAD);
    static void BM_AVX_COUNT_DIRECT(benchmark::State &state) {
        for (auto _ : state) {
            int64_t cnt = 0;
            auto avxVal = _mm256_set1_epi16(VAL);
            for (int i = 0; i < ARR_SIZE; i += 16) {
                cnt += _popcnt32(
                        _mm256_movemask_epi8(
                                _mm256_cmpeq_epi16(
                                        avxVal,
                                        *(__m256i *) &allignedArr[i]
                                )
                        )
                );
            }
            benchmark::DoNotOptimize(cnt >> 1);
        }
    }
    BENCHMARK(BM_AVX_COUNT_DIRECT);
    #endif
    BENCHMARK_MAIN();

    The new results (-O0) look like this:


    ---------------------------------------------------------------------
    Benchmark                              Time           CPU Iterations
    ---------------------------------------------------------------------
    BM_Count                        17226622 ns   17062958 ns         41
    BM_SSE_COUNT_SET_EPI             8901343 ns    8814845 ns         79
    BM_SSE_COUNT_LOADU               3664778 ns    3664766 ns        185
    BM_SSE_COUNT_DIRECT              3468436 ns    3468423 ns        202
    BM_AVX_COUNT_LOADU               2090817 ns    2090796 ns        343
    BM_AVX_COUNT_LOAD                1904424 ns    1904419 ns        364
    BM_AVX_COUNT_DIRECT              1814875 ns    1814854 ns        385

    The total cumulative speedup is more than 9x (17,226,622 ns vs 1,814,875 ns); AVX is, as expected, almost twice as fast as SSE.


    Is _mm256_load_si256 slower than direct memory access?


    There is no definite answer. With -O0 it is slower than direct addressing, but faster than _mm256_loadu_si256:


    ---------------------------------------------------------------------
    Benchmark                              Time           CPU Iterations
    ---------------------------------------------------------------------
    BM_AVX_COUNT_LOADU               2090817 ns    2090796 ns        343
    BM_AVX_COUNT_LOAD                1904424 ns    1904419 ns        364
    BM_AVX_COUNT_DIRECT              1814875 ns    1814854 ns        385

    With -O3 it is faster than direct memory access, and also slightly faster than _mm256_loadu_si256:


    ---------------------------------------------------------------------
    Benchmark                              Time           CPU Iterations
    ---------------------------------------------------------------------
    BM_AVX_COUNT_LOADU                992319 ns     992368 ns        701
    BM_AVX_COUNT_LOAD                 956120 ns     956166 ns        712
    BM_AVX_COUNT_DIRECT              1027624 ns    1027674 ns        730

    In production code it is still better to use _mm256_load_si256 rather than a direct pointer dereference: the compiler can optimize this variant better.


    Does using AVX instructions on SSE registers affect speed?


    The short answer is no. For the experiment, I built and ran the benchmark with -mavx2 and with -msse4.2.


    -mavx2


    _popcnt32(_mm_movemask_epi8(_mm_cmpeq_epi16(...))) turns into


    vpcmpeqw %xmm1,%xmm0,%xmm0
    vpmovmskb %xmm0,%edx
    popcnt %edx,%edx

    Results:


    ------------------------------------------------------------
    Benchmark                     Time           CPU Iterations
    ------------------------------------------------------------
    BM_SSE_COUNT_SET_EPI    9376699 ns    9376767 ns         75
    BM_SSE_COUNT_LOADU      4425510 ns    4425560 ns        159
    BM_SSE_COUNT_DIRECT     3938604 ns    3938648 ns        177

    -msse4.2


    _popcnt32(_mm_movemask_epi8(_mm_cmpeq_epi16(...))) turns into


    pcmpeqw %xmm1,%xmm0
    pmovmskb %xmm0,%edx
    popcnt %edx,%edx

    Results:


    ------------------------------------------------------------
    Benchmark                     Time           CPU Iterations
    ------------------------------------------------------------
    BM_SSE_COUNT_SET_EPI    9309352 ns    9309375 ns         76
    BM_SSE_COUNT_LOADU      4382183 ns    4382195 ns        159
    BM_SSE_COUNT_DIRECT     3944579 ns    3944590 ns        176

    Bonus


    The AVX combination _popcnt32(_mm256_movemask_epi8(_mm256_cmpeq_epi16(...))) turns into


    vpcmpeqw %ymm1,%ymm0,%ymm0
    vpmovmskb %ymm0,%edx
    popcnt %edx,%edx

    Is it really that bad to use _popcnt?


    In one of the comments Antervis wrote:


    Also, your algorithm is a bit flawed. Why go through movemask + popcnt? For arrays of no more than 2^18 elements, you can first accumulate the element-wise sum:
    auto cmp = _mm_cmpeq_epi16 (sseVal, sseArr);
    cmp = _mm_and_si128 (cmp, _mm_set1_epi16 (1));
    sum = _mm_add_epi16 (sum, cmp);

    and then, at the end of the loop, do a single horizontal addition (not forgetting about overflow).

    I made a benchmark


    static void BM_AVX_COUNT_DIRECT_WO_POPCNT(benchmark::State &state) {
        auto avxVal1 = _mm256_set1_epi16(1);
        for (auto _ : state) {
            auto sum = _mm256_set1_epi16(0);
            auto avxVal = _mm256_set1_epi16(VAL);
            for (int i = 0; i < ARR_SIZE; i += 16) {
                sum = _mm256_add_epi16(
                        sum,
                        _mm256_and_si256(
                                avxVal1,
                                _mm256_cmpeq_epi16(
                                        avxVal,
                                        *(__m256i *) &allignedArr[i])
                        )
                );
            }
            auto arrSum = (uint16_t *) &sum;
            size_t cnt = 0;
            for (int j = 0; j < 16; ++j)
                cnt += arrSum[j];
            benchmark::DoNotOptimize(cnt); // no >> 1 here: each matching element adds exactly 1 to the sum
        }
    }
    BENCHMARK(BM_AVX_COUNT_DIRECT_WO_POPCNT);

    and it turned out to be slower with -O0:


    ---------------------------------------------------------------------
    Benchmark                              Time           CPU Iterations
    ---------------------------------------------------------------------
    BM_AVX_COUNT_DIRECT              1814821 ns    1814785 ns        392
    BM_AVX_COUNT_DIRECT_WO_POPCNT    2386289 ns    2386227 ns        287

    and a little faster with -O3:


    ---------------------------------------------------------------------
    Benchmark                              Time           CPU Iterations
    ---------------------------------------------------------------------
    BM_AVX_COUNT_DIRECT               960941 ns     960924 ns        722
    BM_AVX_COUNT_DIRECT_WO_POPCNT     948611 ns     948596 ns        732
