-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsimdcsv.c
More file actions
149 lines (128 loc) · 5.7 KB
/
simdcsv.c
File metadata and controls
149 lines (128 loc) · 5.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
#include <emmintrin.h>
#include <fcntl.h>
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/param.h>
#include <unistd.h>
#include <wmmintrin.h>
/* Persistent state for the SIMD CSV scanner; one csvRead() call consumes
 * exactly 64 bytes of input. */
struct Parser {
__m512i delim_zmm; /* field delimiter byte broadcast to all 64 lanes */
__m512i nl_zmm;    /* record separator byte broadcast to all 64 lanes */
__mmask64 prevquoted; /* all-ones if the previous 64-byte chunk ended inside
                       * a quoted region, 0 otherwise (updated by csvRead) */
size_t nfields; /* configured field count; set by initParser but not read
                 * by csvRead in this file — presumably reserved for
                 * validation. TODO(review): confirm intended use */
};
/*
 * Build a Parser configured for the given delimiter and record separator.
 *
 * nfields: expected field count (stored, not interpreted here).
 * delim:   field delimiter byte (e.g. ','), broadcast to a 64-lane vector.
 * nl:      record separator byte (e.g. '\n'), broadcast likewise.
 *
 * Returns the parser by value with quote-carry state cleared, so the first
 * csvRead() call starts outside any quoted region.
 */
struct Parser initParser(size_t nfields, char delim, char nl) {
  /* {0} is the portable C11 zero-initializer; the original's empty {} is
   * only valid as of C23 (or as a GNU extension). */
  struct Parser parser = {0};
  parser.delim_zmm = _mm512_set1_epi8(delim);
  parser.nl_zmm = _mm512_set1_epi8(nl);
  parser.nfields = nfields;
  return parser;
}
/*
 * Scan one 64-byte chunk at `data` and write into `fields` a pointer to the
 * byte following every unquoted ',' or '\n' in the chunk — i.e. the start of
 * each field.  Quote state carries across chunks via parser->prevquoted.
 *
 * NOTE(review): unconditionally loads a full 64 bytes from `data`, and each
 * compress-store below writes a full 8-pointer vector, so the final store can
 * touch up to 8 slots past the last real field.  The caller must guarantee
 * 64 readable bytes at `data` and enough slack in `fields` (main() uses a
 * zero-padded 128 KiB buffer and a 64-entry fields array, which fits exactly
 * in the worst case of 64 separators).
 *
 * Always returns 0.
 */
int csvRead(struct Parser *parser, const char *data, char **fields) {
  __m512i data_zmm = _mm512_loadu_epi8(data);
  /* Per-byte hit masks for newline and delimiter, OR'd into one separator
   * mask: bit i set <=> data[i] ends a field. */
  __mmask64 nlcmp = _mm512_cmpeq_epi8_mask(data_zmm, parser->nl_zmm);
  __mmask64 delimcmp = _mm512_cmpeq_epi8_mask(data_zmm, parser->delim_zmm);
  __mmask64 fieldLocs = _kor_mask64(delimcmp, nlcmp);
#ifndef NO_QUOTES
  __mmask64 quoteLocs =
      _mm512_cmpeq_epi8_mask(data_zmm, _mm512_set1_epi8('"'));
  /* pext(q, q) packs the set bits of quoteLocs into the low popcount(q)
   * bits, so bit i of the packed value is the i-th quote in the chunk. */
  __mmask64 packedQuotes = _pext_u64(quoteLocs, quoteLocs);
  /* A quote immediately before a separator is a candidate closing quote... */
  __mmask64 endQuoteLocs =
      _kand_mask64(_kshiftri_mask64(fieldLocs, 1), quoteLocs);
  /* ...and a quote immediately after a separator is a candidate opener. */
  __mmask64 startQuoteLocs =
      _kand_mask64(quoteLocs, _kshiftli_mask64(fieldLocs, 1)); // ,""",""",
  /* Split quotes by index parity; the phase flips when the previous chunk
   * ended inside a quoted region (prevquoted is 0 or all-ones, so & 1 picks
   * the carry bit). */
  __mmask64 evenPackedQuotes = _kand_mask64(
      packedQuotes, 0x5555555555555555 << (parser->prevquoted & 1ULL));
  __mmask64 oddPackedQuotes = _kand_mask64(
      packedQuotes, 0x5555555555555555 << (~parser->prevquoted & 1ULL));
  /* pdep scatters the parity-filtered bits back to their byte positions. */
  __mmask64 evenQuoteLocs = _pdep_u64(evenPackedQuotes, quoteLocs);
  __mmask64 oddQuoteLocs = _pdep_u64(oddPackedQuotes, quoteLocs);
  /* Keep only structurally meaningful quotes — openers at separator
   * boundaries with one parity, closers with the other — which drops
   * doubled "" escape quotes inside fields. */
  quoteLocs = _kor_mask64(_kand_mask64(startQuoteLocs, evenQuoteLocs),
                          _kand_mask64(endQuoteLocs, oddQuoteLocs));
  /* Carry-less multiply by an all-ones 64-bit operand is a prefix-XOR: bit i
   * of the product is set iff an odd number of quotes occur at or below byte
   * i, i.e. byte i is inside a quoted region.  XOR with prevquoted continues
   * the region from the previous chunk. */
  __mmask64 quotedVals =
      _kxor_mask64(_mm_cvtsi128_si64(_mm_clmulepi64_si128(
                       _mm_set_epi64x(0, quoteLocs), _mm_set1_epi8(0xFF), 0)),
                   parser->prevquoted);
  /* Separators inside quoted regions are data, not field boundaries. */
  fieldLocs = _kand_mask64(fieldLocs, ~quotedVals);
  /* Sign-extend bit 63: all-ones if this chunk ends inside quotes, else 0. */
  parser->prevquoted = (uint64_t)((int64_t)quotedVals >> 63);
#endif
  /* Offset tables 1..64: a separator at byte i yields field pointer
   * data + i + 1 (the byte after the separator).  Eight offsets per vector,
   * lut0 covering bytes 0-7 up through lut7 covering bytes 56-63. */
  __m512i lut7 = _mm512_set_epi64(64, 63, 62, 61, 60, 59, 58, 57);
  __m512i lut6 = _mm512_set_epi64(56, 55, 54, 53, 52, 51, 50, 49);
  __m512i lut5 = _mm512_set_epi64(48, 47, 46, 45, 44, 43, 42, 41);
  __m512i lut4 = _mm512_set_epi64(40, 39, 38, 37, 36, 35, 34, 33);
  __m512i lut3 = _mm512_set_epi64(32, 31, 30, 29, 28, 27, 26, 25);
  __m512i lut2 = _mm512_set_epi64(24, 23, 22, 21, 20, 19, 18, 17);
  __m512i lut1 = _mm512_set_epi64(16, 15, 14, 13, 12, 11, 10, 9);
  __m512i lut0 = _mm512_set_epi64(8, 7, 6, 5, 4, 3, 2, 1);
  // could use sse4 pextb?
  /* Split the 64-bit separator mask into eight 8-bit lanes, one per LUT
   * (the narrowing to __mmask8 keeps the low 8 bits of each shift). */
  __mmask8 m7 = _kshiftri_mask64(fieldLocs, 7 * 8);
  __mmask8 m6 = _kshiftri_mask64(fieldLocs, 6 * 8);
  __mmask8 m5 = _kshiftri_mask64(fieldLocs, 5 * 8);
  __mmask8 m4 = _kshiftri_mask64(fieldLocs, 4 * 8);
  __mmask8 m3 = _kshiftri_mask64(fieldLocs, 3 * 8);
  __mmask8 m2 = _kshiftri_mask64(fieldLocs, 2 * 8);
  __mmask8 m1 = _kshiftri_mask64(fieldLocs, 1 * 8);
  __mmask8 m0 = _kshiftri_mask64(fieldLocs, 0 * 8);
  // can we vectorize this??? There's the AVX-512 bitops extension, but if I'm
  // understanding correctly, very few CPUs support that
  /* Exclusive prefix popcounts: the index in `fields` where each lane's
   * compressed output begins. */
  int m0prefcnt = 0;
  int m1prefcnt = __builtin_popcount(m0);
  int m2prefcnt = m1prefcnt + __builtin_popcount(m1);
  int m3prefcnt = m2prefcnt + __builtin_popcount(m2);
  int m4prefcnt = m3prefcnt + __builtin_popcount(m3);
  int m5prefcnt = m4prefcnt + __builtin_popcount(m4);
  int m6prefcnt = m5prefcnt + __builtin_popcount(m5);
  int m7prefcnt = m6prefcnt + __builtin_popcount(m6);
  /* Turn offsets into absolute pointers by adding the chunk base address.
   * NOTE(review): the pointer->integer cast would be cleaner as uintptr_t. */
  __m512i ptr_zmm = _mm512_set1_epi64((uint64_t)data);
  lut0 = _mm512_add_epi64(lut0, ptr_zmm);
  lut1 = _mm512_add_epi64(lut1, ptr_zmm);
  lut2 = _mm512_add_epi64(lut2, ptr_zmm);
  lut3 = _mm512_add_epi64(lut3, ptr_zmm);
  lut4 = _mm512_add_epi64(lut4, ptr_zmm);
  lut5 = _mm512_add_epi64(lut5, ptr_zmm);
  lut6 = _mm512_add_epi64(lut6, ptr_zmm);
  lut7 = _mm512_add_epi64(lut7, ptr_zmm);
  // we can combine this and the following storeu into a single instruction
  // zen4 is just really slow at compressstoreu
  /* Compress each lane's selected pointers down to the low elements; the
   * unselected upper elements become 0, which also serves as the NULL
   * terminator the caller's fields scan relies on. */
  __m512i ind0 = _mm512_maskz_compress_epi64(m0, lut0);
  __m512i ind1 = _mm512_maskz_compress_epi64(m1, lut1);
  __m512i ind2 = _mm512_maskz_compress_epi64(m2, lut2);
  __m512i ind3 = _mm512_maskz_compress_epi64(m3, lut3);
  __m512i ind4 = _mm512_maskz_compress_epi64(m4, lut4);
  __m512i ind5 = _mm512_maskz_compress_epi64(m5, lut5);
  __m512i ind6 = _mm512_maskz_compress_epi64(m6, lut6);
  __m512i ind7 = _mm512_maskz_compress_epi64(m7, lut7);
  /* Each store writes a full 8-pointer vector at its lane's prefix offset;
   * later stores overwrite the zero padding of earlier ones. */
  _mm512_storeu_epi64(fields + m0prefcnt, ind0);
  _mm512_storeu_epi64(fields + m1prefcnt, ind1);
  _mm512_storeu_epi64(fields + m2prefcnt, ind2);
  _mm512_storeu_epi64(fields + m3prefcnt, ind3);
  _mm512_storeu_epi64(fields + m4prefcnt, ind4);
  _mm512_storeu_epi64(fields + m5prefcnt, ind5);
  _mm512_storeu_epi64(fields + m6prefcnt, ind6);
  _mm512_storeu_epi64(fields + m7prefcnt, ind7);
  return 0;
}
/*
 * Entry point: read the CSV file named by argv[1] in 128 KiB chunks and run
 * the SIMD field scanner over each 64-byte lane of every chunk.
 * Returns EXIT_SUCCESS on completion, EXIT_FAILURE on usage/IO errors.
 */
int main(int argc, char *argv[]) {
  enum { BUF_SIZE = 1024 * 128 };
  if (argc < 2) { /* original dereferenced argv[1] unconditionally */
    fprintf(stderr, "usage: %s <csv-file>\n", argv[0]);
    return EXIT_FAILURE;
  }
  struct Parser parser = initParser(16, ',', '\n');
  int fd = open(argv[1], O_RDONLY);
  if (fd < 0) {
    perror("open");
    return EXIT_FAILURE;
  }
  char *csv = calloc(1, BUF_SIZE);
  if (!csv) {
    perror("calloc");
    close(fd);
    return EXIT_FAILURE;
  }
  /* Sentinel newline so the very first field is preceded by a separator. */
  csv[0] = '\n';
  int cnt = 0; /* total payload bytes consumed so far */
  ssize_t n;
  /* Zero-init: the NULL-terminated scan below would otherwise read
   * indeterminate pointers before the first csvRead call (UB). */
  char *fields[64] = {0};
  while ((n = read(fd, csv + (cnt == 0), BUF_SIZE - (cnt == 0))) != 0) {
    if (n < 0) { /* original treated the -1 error return as data */
      perror("read");
      break;
    }
    /* Bytes holding live data this round (payload + sentinel on pass 1). */
    size_t used = (size_t)n + (cnt == 0);
    /* Zero the stale tail so leftover bytes from a previous, larger read
     * don't parse as fields.  The original memset started at csv + n, one
     * byte early on the first iteration, clobbering the last data byte. */
    memset(csv + used, 0, BUF_SIZE - used);
    /* Round up to whole 64-byte chunks; used <= BUF_SIZE keeps every
     * 64-byte load inside the buffer. */
    for (size_t i = 0; i < (used + 63) / 64; i++) {
      csvRead(&parser, csv + i * 64, fields);
      for (char **pfield = fields; *pfield; pfield++) {
        // fprintf(stderr, "field at %zu (%.8s)\n",
        //         cnt + ((size_t)*pfield - (size_t)csv), *pfield);
      }
    }
    cnt += (int)n;
  }
  free(csv);
  close(fd);
  return EXIT_SUCCESS;
}