sd_fft: AVX512 implementation for some functions by user202729 · Pull Request #2715 · flintlib/flint

user202729 · 2026-06-10T15:17:40Z

This PR defines new types vec8dz and vec16dz, corresponding to
AVX512 register types, and use that to speed up some functions in sd_fft.c.

Only a limited number of functions in sd_fft.c are modified.
This is mostly to discuss what's the correct interface that we want in machine_vectors.h.
Note that vec8d is already taken. Changing vec8d from two ymm to one zmm register is not unconditionally profitable, since there would not be enough instruction level parallelism.

To limit the amount of code changes, I keep the original transformed element ordering, which in particular mean sd_fft_store_vec8dz_basecase_order becomes very terrible. (it takes only 24 cycles though.)

Original:

  bits coeffs   min
     5     32   203
     6     64   374
     7    128   697
     8    256  1477
     9    512  3155
    10   1024  6849

After first commit:

  bits coeffs   min
     5     32   203
     6     64   289
     7    128   695
     8    256  1218
     9    512  3145
    10   1024  5980

bits=6 shows 22% improvement and bits=8 shows 17% improvement.
bits=7 is unmodified (will need more work).

After second commit:

  bits coeffs   min
     5     32   203
     6     64   289
     7    128   702
     8    256  1110
     9    512  3294
    10   1024  5288

bits=8 shows 8% additional improvement.

The temp benchmark file (AI).

Details

#include <asm/unistd.h>
#include <errno.h>
#include <inttypes.h>
#include <linux/perf_event.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

#include "ulong_extras.h"
#include "fft_small.h"

typedef struct
{
    int fd;
    struct perf_event_mmap_page * pc;
    uint32_t ecx;
    uint16_t pmc_width;
} perf_cycle_counter_t;

static void
perf_cycle_counter_init(perf_cycle_counter_t * counter)
{
    struct perf_event_attr attr;

    memset(&attr, 0, sizeof(attr));
    attr.type = PERF_TYPE_HARDWARE;
    attr.size = sizeof(attr);
    attr.config = PERF_COUNT_HW_CPU_CYCLES;
    attr.exclude_kernel = 1;
    attr.exclude_hv = 1;

    counter->fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
    if (counter->fd < 0)
    {
        flint_printf("perf_event_open failed: %s\n", strerror(errno));
        flint_abort();
    }

    counter->pc = (struct perf_event_mmap_page *)
            mmap(NULL, 4096, PROT_READ, MAP_SHARED, counter->fd, 0);
    if (counter->pc == MAP_FAILED)
    {
        flint_printf("mmap failed: %s\n", strerror(errno));
        close(counter->fd);
        flint_abort();
    }

    if (!counter->pc->cap_user_rdpmc || counter->pc->index == 0)
    {
        flint_printf("userspace rdpmc is unavailable\n");
        munmap(counter->pc, 4096);
        close(counter->fd);
        flint_abort();
    }

    counter->ecx = counter->pc->index - 1;
    counter->pmc_width = counter->pc->pmc_width;

    ioctl(counter->fd, PERF_EVENT_IOC_RESET, 0);
    ioctl(counter->fd, PERF_EVENT_IOC_ENABLE, 0);
}

static void
perf_cycle_counter_clear(perf_cycle_counter_t * counter)
{
    ioctl(counter->fd, PERF_EVENT_IOC_DISABLE, 0);
    munmap(counter->pc, 4096);
    close(counter->fd);
}

static uint64_t
perf_cycle_counter_get_raw(const perf_cycle_counter_t * counter)
{
    uint32_t lo, hi;

    __asm__ volatile("lfence\n\trdpmc\n\tlfence"
                     : "=a"(lo), "=d"(hi)
                     : "c"(counter->ecx)
                     : "memory");

    return ((uint64_t) hi << 32) | lo;
}

static uint64_t
perf_cycle_counter_get_delta(const perf_cycle_counter_t * counter,
        uint64_t before, uint64_t after)
{
    return (after - before) & ((UINT64_C(1) << counter->pmc_width) - 1);
}

static uint64_t
time_sd_fft(perf_cycle_counter_t * counter, sd_fft_ctx_t Q, double * data,
        ulong L, ulong n)
{
    uint64_t before, after;

    before = perf_cycle_counter_get_raw(counter);
    sd_fft_trunc(Q, data, L, n, n);
    after = perf_cycle_counter_get_raw(counter);

    if (data[0] == -1.0)
        flint_printf("\r");

    return perf_cycle_counter_get_delta(counter, before, after);
}

int
main(void)
{
    const ulong reps = 1000;
    sd_fft_ctx_t Q;
    flint_rand_t state;
    perf_cycle_counter_t counter;

    flint_rand_init(state);
    sd_fft_ctx_init_prime(Q, UWORD(0x0003f00000000001));
    perf_cycle_counter_init(&counter);

    flint_printf("bits     coeffs       cycles\n");

    for (ulong i = 5; i <= 10; i++)
    {
        ulong n = UWORD(1) << i;
        uint64_t best = UINT64_MAX;
        double * data;

        sd_fft_ctx_fit_depth(Q, i);
        data = (double *) flint_aligned_alloc(64,
                FLINT_MAX(64, sd_fft_ctx_data_size(i) * sizeof(double)));

        for (ulong j = 0; j < n; j++)
            data[j] = n_randint(state, Q->mod.n);
        sd_fft_trunc(Q, data, i, n, n);

        for (ulong rep = 0; rep < reps; rep++)
        {
            uint64_t cycles;

            for (ulong j = 0; j < n; j++)
                data[j] = n_randint(state, Q->mod.n);

            cycles = time_sd_fft(&counter, Q, data, i, n);
            if (cycles < best)
                best = cycles;
        }

        flint_printf("%4wu %10wu %12" PRIu64 "\n", i, n, best);

        flint_aligned_free(data);
    }

    perf_cycle_counter_clear(&counter);
    sd_fft_ctx_clear(Q);
    flint_rand_clear(state);
    flint_cleanup_master();

    return 0;
}

Use perf events to count number of CPU cycles.
More reliable than rdtscp since clock frequency varies.
Might need sudo sysctl kernel.perf_event_paranoid=-1 to run this.

user202729 · 2026-06-10T15:48:20Z

https://www.texmacs.org/joris/ntt/ntt.html claims 51.4ns (xeon) and 16.5ns (zen4) for r=64... I guess more room for improvement is possible.

albinahlback · 2026-06-10T15:53:04Z

https://www.texmacs.org/joris/ntt/ntt.html claims 51.4ns (xeon) and 16.5ns (zen4) for r=64... I guess more room for improvement is possible.

I believe Joris optimized through SLPs, i.e. no branching at all, in the framework of Mathemagix (i.e. their own compiler).

user202729 · 2026-06-11T01:18:01Z

The sd_fft_basecase_6_0 and sd_fft_basecase_6_1 are straight-line programs too. There must be some other magic being done here...

user202729 added 2 commits June 11, 2026 20:14

sd_fft: AVX512 implementation for base case 6

31555f7

sd_fft: Use vec16dz at some places

acbd4dd

user202729 force-pushed the sd-fft-avx512 branch from 29a6814 to acbd4dd Compare June 11, 2026 13:16

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

sd_fft: AVX512 implementation for some functions#2715

sd_fft: AVX512 implementation for some functions#2715
user202729 wants to merge 2 commits into
flintlib:mainfrom
user202729:sd-fft-avx512

user202729 commented Jun 10, 2026

Uh oh!

user202729 commented Jun 10, 2026

Uh oh!

albinahlback commented Jun 10, 2026

Uh oh!

user202729 commented Jun 11, 2026

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

2 participants

Conversation

user202729 commented Jun 10, 2026

Uh oh!

user202729 commented Jun 10, 2026

Uh oh!

albinahlback commented Jun 10, 2026

Uh oh!

user202729 commented Jun 11, 2026

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

2 participants