Skip to content

sd_fft: AVX512 implementation for some functions#2715

Open
user202729 wants to merge 2 commits into
flintlib:mainfrom
user202729:sd-fft-avx512
Open

sd_fft: AVX512 implementation for some functions#2715
user202729 wants to merge 2 commits into
flintlib:mainfrom
user202729:sd-fft-avx512

Conversation

@user202729

Copy link
Copy Markdown
Contributor

This PR defines new types vec8dz and vec16dz, corresponding to
AVX512 register types, and use that to speed up some functions in sd_fft.c.

Only a limited number of functions in sd_fft.c are modified.
This is mostly to discuss what's the correct interface that we want in machine_vectors.h.
Note that vec8d is already taken. Changing vec8d from two ymm to one zmm register is not unconditionally profitable, since there would not be enough instruction level parallelism.

To limit the amount of code changes, I keep the original transformed element ordering, which in particular mean sd_fft_store_vec8dz_basecase_order becomes very terrible. (it takes only 24 cycles though.)

Original:

  bits coeffs   min
     5     32   203
     6     64   374
     7    128   697
     8    256  1477
     9    512  3155
    10   1024  6849

After first commit:

  bits coeffs   min
     5     32   203
     6     64   289
     7    128   695
     8    256  1218
     9    512  3145
    10   1024  5980

bits=6 shows 22% improvement and bits=8 shows 17% improvement.
bits=7 is unmodified (will need more work).

After second commit:

  bits coeffs   min
     5     32   203
     6     64   289
     7    128   702
     8    256  1110
     9    512  3294
    10   1024  5288

bits=8 shows 8% additional improvement.


The temp benchmark file (AI).

Details
#include <asm/unistd.h>
#include <errno.h>
#include <inttypes.h>
#include <linux/perf_event.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

#include "ulong_extras.h"
#include "fft_small.h"

typedef struct
{
    int fd;
    struct perf_event_mmap_page * pc;
    uint32_t ecx;
    uint16_t pmc_width;
} perf_cycle_counter_t;

static void
perf_cycle_counter_init(perf_cycle_counter_t * counter)
{
    struct perf_event_attr attr;

    memset(&attr, 0, sizeof(attr));
    attr.type = PERF_TYPE_HARDWARE;
    attr.size = sizeof(attr);
    attr.config = PERF_COUNT_HW_CPU_CYCLES;
    attr.exclude_kernel = 1;
    attr.exclude_hv = 1;

    counter->fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
    if (counter->fd < 0)
    {
        flint_printf("perf_event_open failed: %s\n", strerror(errno));
        flint_abort();
    }

    counter->pc = (struct perf_event_mmap_page *)
            mmap(NULL, 4096, PROT_READ, MAP_SHARED, counter->fd, 0);
    if (counter->pc == MAP_FAILED)
    {
        flint_printf("mmap failed: %s\n", strerror(errno));
        close(counter->fd);
        flint_abort();
    }

    if (!counter->pc->cap_user_rdpmc || counter->pc->index == 0)
    {
        flint_printf("userspace rdpmc is unavailable\n");
        munmap(counter->pc, 4096);
        close(counter->fd);
        flint_abort();
    }

    counter->ecx = counter->pc->index - 1;
    counter->pmc_width = counter->pc->pmc_width;

    ioctl(counter->fd, PERF_EVENT_IOC_RESET, 0);
    ioctl(counter->fd, PERF_EVENT_IOC_ENABLE, 0);
}

static void
perf_cycle_counter_clear(perf_cycle_counter_t * counter)
{
    ioctl(counter->fd, PERF_EVENT_IOC_DISABLE, 0);
    munmap(counter->pc, 4096);
    close(counter->fd);
}

static uint64_t
perf_cycle_counter_get_raw(const perf_cycle_counter_t * counter)
{
    uint32_t lo, hi;

    __asm__ volatile("lfence\n\trdpmc\n\tlfence"
                     : "=a"(lo), "=d"(hi)
                     : "c"(counter->ecx)
                     : "memory");

    return ((uint64_t) hi << 32) | lo;
}

static uint64_t
perf_cycle_counter_get_delta(const perf_cycle_counter_t * counter,
        uint64_t before, uint64_t after)
{
    return (after - before) & ((UINT64_C(1) << counter->pmc_width) - 1);
}

static uint64_t
time_sd_fft(perf_cycle_counter_t * counter, sd_fft_ctx_t Q, double * data,
        ulong L, ulong n)
{
    uint64_t before, after;

    before = perf_cycle_counter_get_raw(counter);
    sd_fft_trunc(Q, data, L, n, n);
    after = perf_cycle_counter_get_raw(counter);

    if (data[0] == -1.0)
        flint_printf("\r");

    return perf_cycle_counter_get_delta(counter, before, after);
}

int
main(void)
{
    const ulong reps = 1000;
    sd_fft_ctx_t Q;
    flint_rand_t state;
    perf_cycle_counter_t counter;

    flint_rand_init(state);
    sd_fft_ctx_init_prime(Q, UWORD(0x0003f00000000001));
    perf_cycle_counter_init(&counter);

    flint_printf("bits     coeffs       cycles\n");

    for (ulong i = 5; i <= 10; i++)
    {
        ulong n = UWORD(1) << i;
        uint64_t best = UINT64_MAX;
        double * data;

        sd_fft_ctx_fit_depth(Q, i);
        data = (double *) flint_aligned_alloc(64,
                FLINT_MAX(64, sd_fft_ctx_data_size(i) * sizeof(double)));

        for (ulong j = 0; j < n; j++)
            data[j] = n_randint(state, Q->mod.n);
        sd_fft_trunc(Q, data, i, n, n);

        for (ulong rep = 0; rep < reps; rep++)
        {
            uint64_t cycles;

            for (ulong j = 0; j < n; j++)
                data[j] = n_randint(state, Q->mod.n);

            cycles = time_sd_fft(&counter, Q, data, i, n);
            if (cycles < best)
                best = cycles;
        }

        flint_printf("%4wu %10wu %12" PRIu64 "\n", i, n, best);

        flint_aligned_free(data);
    }

    perf_cycle_counter_clear(&counter);
    sd_fft_ctx_clear(Q);
    flint_rand_clear(state);
    flint_cleanup_master();

    return 0;
}

Use perf events to count number of CPU cycles.
More reliable than rdtscp since clock frequency varies.
Might need sudo sysctl kernel.perf_event_paranoid=-1 to run this.

@user202729

Copy link
Copy Markdown
Contributor Author

https://www.texmacs.org/joris/ntt/ntt.html claims 51.4ns (xeon) and 16.5ns (zen4) for r=64... I guess more room for improvement is possible.

@albinahlback

Copy link
Copy Markdown
Collaborator

https://www.texmacs.org/joris/ntt/ntt.html claims 51.4ns (xeon) and 16.5ns (zen4) for r=64... I guess more room for improvement is possible.

I believe Joris optimized through SLPs, i.e. no branching at all, in the framework of Mathemagix (i.e. their own compiler).

@user202729

Copy link
Copy Markdown
Contributor Author

The sd_fft_basecase_6_0 and sd_fft_basecase_6_1 are straight-line programs too. There must be some other magic being done here...

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

2 participants