Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions target/i386/latx/include/pclmul.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#ifndef _PCLMUL_H_
#define _PCLMUL_H_

#include "common.h"
#include "ir2.h"
#include "la-append.h"
#include "macro-inst.h"
#include "reg-alloc.h"

/*
 * Emit host code that computes the 128-bit carry-less (XOR) product of two
 * 64-bit integer operands.
 *
 * The emitted loop visits the set bits of `lhs` from least significant to
 * most significant.  For each set bit at position k it XORs `rhs << k` into
 * `res_lo` and the bits that were shifted out of the top (`rhs >> (64 - k)`)
 * into `res_hi`.
 *
 * lhs    - integer operand whose set bits drive the loop; the emitted code
 *          clears its bits one by one, so it holds zero afterwards.
 * rhs    - integer operand that is shifted and accumulated; only read.
 * res_lo - receives the low 64 bits of the product (zeroed first).
 * res_hi - receives the high 64 bits of the product (zeroed first).
 *
 * Two integer temporaries and two labels are allocated here; this function
 * does not release the temporaries itself.
 *
 * The per-instruction comments assume each la_* emitter produces the
 * LoongArch instruction it is named after.
 */
static inline void emit_pclmul_ctz_loop(IR2_OPND lhs, IR2_OPND rhs,
                                        IR2_OPND res_lo, IR2_OPND res_hi)
{
    IR2_OPND bit_pos = ra_alloc_itemp();
    IR2_OPND scratch = ra_alloc_itemp();
    IR2_OPND next_bit = ra_alloc_label();
    IR2_OPND done = ra_alloc_label();

    /* Start from a zero product; nothing to accumulate if lhs is zero. */
    la_xor(res_lo, res_lo, res_lo);
    la_xor(res_hi, res_hi, res_hi);
    la_beqz(lhs, done);

    la_label(next_bit);
    /* bit_pos = index of the lowest set bit of lhs. */
    la_ctz_d(bit_pos, lhs);
    /* lhs &= lhs - 1 : drop that lowest set bit. */
    la_addi_d(scratch, lhs, -1);
    la_and(lhs, lhs, scratch);
    /* Low half: res_lo ^= rhs << bit_pos. */
    la_sll_d(scratch, rhs, bit_pos);
    la_xor(res_lo, res_lo, scratch);
    /* High half candidate: scratch = rhs >> (64 - bit_pos). */
    li_d(scratch, 64);
    la_sub_d(scratch, scratch, bit_pos);
    la_srl_d(scratch, rhs, scratch);
    /*
     * Turn bit_pos into an all-ones mask when it was non-zero and into zero
     * otherwise.  For bit_pos == 0 the shift amount above is 64, which does
     * not yield the required zero on its own, so the mask discards it.
     */
    la_sltu(bit_pos, zero_ir2_opnd, bit_pos);
    la_sub_d(bit_pos, zero_ir2_opnd, bit_pos);
    la_and(scratch, scratch, bit_pos);
    la_xor(res_hi, res_hi, scratch);
    /* Keep going while lhs still has set bits. */
    la_bnez(lhs, next_bit);

    la_label(done);
}

#endif
102 changes: 72 additions & 30 deletions target/i386/latx/translator/tr-avx.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include "translate.h"
#include "hbr.h"
#include "tr-vpaes.h"
#include "pclmul.h"

#ifdef CONFIG_LATX_AVX_OPT
bool translate_vaddpd(IR1_INST * pir1) {
Expand Down Expand Up @@ -5043,43 +5044,84 @@ bool translate_vpclmulqdq(IR1_INST * pir1) {
int s1 = ir1_opnd_base_reg_num(opnd1);
uint8_t ctrl = ir1_opnd_uimm(opnd3);

ADDR helper_func;

int helper_kind = 0;
if (ir1_opnd_is_ymm(opnd0)) {
helper_func = (ADDR)helper_vpclmulqdq_ymm;
helper_kind = LOAD_HELPER_VPCLMULQDQ_YMM;
} else {
helper_func = (ADDR)helper_vpclmulqdq_xmm;
helper_kind = LOAD_HELPER_VPCLMULQDQ_XMM;
}

if (!ir1_opnd_is_mem(opnd2)) {
int s2 = ir1_opnd_base_reg_num(opnd2);
tr_gen_call_to_helper_pclmulqdq((ADDR)helper_func, s0, s1, s2, ctrl, 0,
helper_kind);
} else {
int s2 = 0;
while (s2 < 8) {
if (s2 != s0 && s2 != s1) {
break;
IR2_OPND src1 = load_freg256_from_ir1(opnd1);
IR2_OPND src2 = ir1_opnd_is_mem(opnd2) ? ra_alloc_ftemp() : load_freg256_from_ir1(opnd2);
IR2_OPND src1_copy = src1;
IR2_OPND src2_copy = src2;
IR2_OPND lhs = ra_alloc_itemp();
IR2_OPND rhs = ra_alloc_itemp();
IR2_OPND res_lo = ra_alloc_itemp();
IR2_OPND res_hi = ra_alloc_itemp();

if (ir1_opnd_is_mem(opnd2)) {
if (ir1_opnd_size(opnd2) == 128) {
load_freg128_from_ir1_mem(src2, opnd2);
} else {
load_freg256_from_ir1_mem(src2, opnd2);
}
s2++;
}
IR2_OPND temp_mem = ra_alloc_ftemp();
IR2_OPND src = ra_alloc_xmm(s2);
la_xvor_v(temp_mem, src, src);
if (ir1_opnd_size(opnd2) == 128) {
load_freg128_from_ir1_mem(src, opnd2);
if (s0 == s1) {
src1_copy = ra_alloc_ftemp();
la_xvori_b(src1_copy, src1, 0);
}
if (!ir1_opnd_is_mem(opnd2) && s0 == ir1_opnd_base_reg_num(opnd2)) {
src2_copy = ra_alloc_ftemp();
la_xvori_b(src2_copy, src2, 0);
}

la_xvxor_v(dest, dest, dest);

la_xvpickve2gr_d(lhs, src1_copy, (ctrl & 1) ? 1 : 0);
la_xvpickve2gr_d(rhs, src2_copy, (ctrl & 0x10) ? 1 : 0);
emit_pclmul_ctz_loop(lhs, rhs, res_lo, res_hi);
la_xvinsgr2vr_d(dest, res_lo, 0);
la_xvinsgr2vr_d(dest, res_hi, 1);

la_xvpickve2gr_d(lhs, src1_copy, (ctrl & 1) ? 3 : 2);
la_xvpickve2gr_d(rhs, src2_copy, (ctrl & 0x10) ? 3 : 2);
emit_pclmul_ctz_loop(lhs, rhs, res_lo, res_hi);
la_xvinsgr2vr_d(dest, res_lo, 2);
la_xvinsgr2vr_d(dest, res_hi, 3);

ra_free_temp_auto(src2);
} else {
IR2_OPND src1 = ra_alloc_xmm(s1);
IR2_OPND src2;
IR2_OPND temp = ra_alloc_ftemp();
IR2_OPND ctrlp = ra_alloc_itemp();
IR2_OPND lhs = ra_alloc_itemp();
IR2_OPND rhs = ra_alloc_itemp();
IR2_OPND res_lo = ra_alloc_itemp();
IR2_OPND res_hi = ra_alloc_itemp();
IR2_OPND ftemp = ra_alloc_ftemp();

if (!ir1_opnd_is_mem(opnd2)) {
src2 = ra_alloc_xmm(ir1_opnd_base_reg_num(opnd2));
} else {
load_freg256_from_ir1_mem(src, opnd2);
src2 = ra_alloc_ftemp();
load_freg128_from_ir1_mem(src2, opnd2);
}
tr_gen_call_to_helper_pclmulqdq((ADDR)helper_func, s0, s1, s2, ctrl, 0,
helper_kind);
la_xvor_v(src, temp_mem, temp_mem);

li_d(ctrlp, ctrl);
la_andi(lhs, ctrlp, 1);
la_vreplve_d(ftemp, src1, lhs);
la_vpickve2gr_d(lhs, ftemp, 0);
la_bstrpick_d(rhs, ctrlp, 4, 4);
la_vreplve_d(ftemp, src2, rhs);
la_vpickve2gr_d(rhs, ftemp, 0);

emit_pclmul_ctz_loop(lhs, rhs, res_lo, res_hi);
la_vxor_v(temp, temp, temp);
la_vinsgr2vr_d(temp, res_lo, 0);
la_vinsgr2vr_d(temp, res_hi, 1);
set_high128_xreg_to_zero(temp);
la_xvori_b(dest, temp, 0);
ra_free_temp_auto(src2);
}
if (ir1_opnd_size(opnd2) == 128)
if (ir1_opnd_size(opnd2) == 128) {
set_high128_xreg_to_zero(dest);
}
return true;
}

Expand Down
73 changes: 26 additions & 47 deletions target/i386/latx/translator/tr-simd.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include "translate.h"
#include "hbr.h"
#include "tr-vpaes.h"
#include "pclmul.h"

bool translate_por(IR1_INST *pir1)
{
Expand Down Expand Up @@ -3707,52 +3708,6 @@ bool translate_pcmpistrm(IR1_INST *pir1)
return true;
}

/*
 * Emit host code for a 64x64 -> 128 bit carry-less multiplication and place
 * the result in the two low 64-bit elements of `d`.
 *
 * d    - vector operand receiving the product (element 0 = low half,
 *        element 1 = high half).
 * v    - vector operand supplying the first factor; bit 0 of `ctrl` selects
 *        which 64-bit element is used.
 * s    - vector operand supplying the second factor; bit 4 of `ctrl` selects
 *        which 64-bit element is used.
 * ctrl - immediate control byte; only bits 0 and 4 are read here.
 *
 * The emitted loop is a shift-and-XOR multiply: the second factor is
 * consumed one bit at a time from the bottom, while the first factor is
 * kept as a 128-bit value (mcand_hi:mcand_lo) that is shifted left by one
 * on every iteration.
 *
 * NOTE(review): a0_ir2_opnd and a1_ir2_opnd are written as scratch registers
 * without going through the temp allocator - confirm they cannot overlap
 * the integer temporaries allocated in this function.
 */
static void cal_pclmulqdq(IR2_OPND d, IR2_OPND v, IR2_OPND s, uint8_t ctrl)
{
    IR2_OPND imm_reg = ra_alloc_itemp();
    IR2_OPND mcand_lo = ra_alloc_itemp(); /* low 64 bits of the first factor */
    IR2_OPND mplier = ra_alloc_itemp();
    IR2_OPND lane_vec = ra_alloc_ftemp();

    /* Select the operands. */
    li_d(imm_reg, ctrl);
    /* mcand_lo = ctrl & 1, then used as the element index into v. */
    la_andi(mcand_lo, imm_reg, 1);
    la_vreplve_d(lane_vec, v, mcand_lo);
    la_vpickve2gr_d(mcand_lo, lane_vec, 0);
    /* mplier = bit 4 of ctrl, then used as the element index into s. */
    la_bstrpick_d(mplier, imm_reg, 4, 4);
    la_vreplve_d(lane_vec, s, mplier);
    la_vpickve2gr_d(mplier, lane_vec, 0);
    ra_free_temp(imm_reg);

    IR2_OPND mcand_hi = ra_alloc_itemp();
    IR2_OPND prod_lo = ra_alloc_itemp();
    IR2_OPND prod_hi = ra_alloc_itemp();
    IR2_OPND step_label = ra_alloc_label();
    IR2_OPND skip_xor = ra_alloc_label();
    IR2_OPND finished = ra_alloc_label();

    /* Start the computation: clear the accumulators and the high half. */
    la_and(prod_lo, prod_lo, zero_ir2_opnd);
    la_and(prod_hi, prod_hi, zero_ir2_opnd);
    la_and(mcand_hi, mcand_hi, zero_ir2_opnd);
    /* A zero multiplier leaves the product at zero. */
    la_beqz(mplier, finished);

    la_label(step_label);
    /* Test the lowest multiplier bit (mplier & 1), then shift it out. */
    la_andi(a0_ir2_opnd, mplier, 1);
    la_srli_d(mplier, mplier, 1);
    la_beqz(a0_ir2_opnd, skip_xor);
    /* Bit was set: XOR the current 128-bit multiplicand into the product. */
    la_xor(prod_lo, prod_lo, mcand_lo);
    la_xor(prod_hi, prod_hi, mcand_hi);

    la_label(skip_xor);
    /* Shift mcand_hi:mcand_lo left by one, carrying bit 63 of the low half. */
    la_slli_d(a1_ir2_opnd, mcand_hi, 1);
    la_bstrpick_d(mcand_hi, mcand_lo, 63, 63);
    la_or(mcand_hi, a1_ir2_opnd, mcand_hi);
    la_slli_d(mcand_lo, mcand_lo, 1);
    /* Continue while multiplier bits remain. */
    la_bnez(mplier, step_label);

    la_label(finished);
    /* Write the 128-bit product into elements 0 and 1 of d. */
    la_vinsgr2vr_d(d, prod_lo, 0);
    la_vinsgr2vr_d(d, prod_hi, 1);
}

bool translate_pclmulqdq(IR1_INST * pir1) {
IR1_OPND * opnd0 = ir1_get_opnd(pir1, 0);
IR1_OPND * opnd1 = ir1_get_opnd(pir1, 1);
Expand All @@ -3762,16 +3717,40 @@ bool translate_pclmulqdq(IR1_INST * pir1) {
uint8_t ctrl = ir1_opnd_uimm(opnd2);
IR2_OPND dest = ra_alloc_xmm(s0);
IR2_OPND src;
IR2_OPND dest_copy = dest;
IR2_OPND ctrlp = ra_alloc_itemp();
IR2_OPND lhs = ra_alloc_itemp();
IR2_OPND rhs = ra_alloc_itemp();
IR2_OPND res_lo = ra_alloc_itemp();
IR2_OPND res_hi = ra_alloc_itemp();
IR2_OPND ftemp = ra_alloc_ftemp();

if (!ir1_opnd_is_mem(opnd1)) {
int s1 = ir1_opnd_base_reg_num(opnd1);
src = ra_alloc_xmm(s1);
if (s0 == s1) {
dest_copy = ra_alloc_ftemp();
la_vori_b(dest_copy, dest, 0);
src = dest_copy;
}
} else {
src = ra_alloc_ftemp();
load_freg128_from_ir1_mem(src, opnd1);
}

cal_pclmulqdq(dest, dest, src, ctrl);
li_d(ctrlp, ctrl);
la_andi(lhs, ctrlp, 1);
la_vreplve_d(ftemp, dest_copy, lhs);
la_vpickve2gr_d(lhs, ftemp, 0);
la_bstrpick_d(rhs, ctrlp, 4, 4);
la_vreplve_d(ftemp, src, rhs);
la_vpickve2gr_d(rhs, ftemp, 0);

emit_pclmul_ctz_loop(lhs, rhs, res_lo, res_hi);
la_vxor_v(dest, dest, dest);
la_vinsgr2vr_d(dest, res_lo, 0);
la_vinsgr2vr_d(dest, res_hi, 1);
ra_free_temp_auto(src);
return true;
}

Expand Down