diff --git a/target/i386/latx/include/pclmul.h b/target/i386/latx/include/pclmul.h
new file mode 100644
index 0000000000..a12c209fd5
--- /dev/null
+++ b/target/i386/latx/include/pclmul.h
@@ -0,0 +1,40 @@
+#ifndef _PCLMUL_H_
+#define _PCLMUL_H_
+
+#include "common.h"
+#include "ir2.h"
+#include "la-append.h"
+#include "macro-inst.h"
+#include "reg-alloc.h"
+/* Carry-less multiply: res_hi:res_lo = lhs (x) rhs; lhs is consumed. */
+static inline void emit_pclmul_ctz_loop(IR2_OPND lhs, IR2_OPND rhs,
+                                        IR2_OPND res_lo, IR2_OPND res_hi)
+{
+    IR2_OPND shift = ra_alloc_itemp();
+    IR2_OPND tmp = ra_alloc_itemp();
+    IR2_OPND loop_label = ra_alloc_label();
+    IR2_OPND end_label = ra_alloc_label();
+    /* NOTE(review): shift/tmp are not freed here; confirm that is fine. */
+    la_xor(res_lo, res_lo, res_lo);            /* res_lo = 0 */
+    la_xor(res_hi, res_hi, res_hi);            /* res_hi = 0 */
+    la_beqz(lhs, end_label);                   /* lhs == 0: result stays 0 */
+    /* One pass per set bit of lhs, lowest bit first. */
+    la_label(loop_label);
+    la_ctz_d(shift, lhs);                      /* index of lowest set bit */
+    la_addi_d(tmp, lhs, -1);
+    la_and(lhs, lhs, tmp);                     /* lhs &= lhs - 1 (clear it) */
+    la_sll_d(tmp, rhs, shift);
+    la_xor(res_lo, res_lo, tmp);               /* res_lo ^= rhs << shift */
+    li_d(tmp, 64);
+    la_sub_d(tmp, tmp, shift);                 /* tmp = 64 - shift */
+    la_srl_d(tmp, rhs, tmp);                   /* rhs >> (64 - shift) */
+    la_sltu(shift, zero_ir2_opnd, shift);      /* shift = (shift != 0) */
+    la_sub_d(shift, zero_ir2_opnd, shift);     /* all-ones mask, or 0 */
+    la_and(tmp, tmp, shift);                   /* no high part if shift == 0 */
+    la_xor(res_hi, res_hi, tmp);               /* res_hi ^= high part */
+    la_bnez(lhs, loop_label);                  /* more set bits remain */
+
+    la_label(end_label);
+}
+
+#endif
diff --git a/target/i386/latx/translator/tr-avx.c b/target/i386/latx/translator/tr-avx.c
index fd0c59f07a..e889a1dd94 100644
--- a/target/i386/latx/translator/tr-avx.c
+++ b/target/i386/latx/translator/tr-avx.c
@@ -4,6 +4,7 @@
 #include "translate.h"
 #include "hbr.h"
 #include "tr-vpaes.h"
+#include "pclmul.h"
 
 #ifdef CONFIG_LATX_AVX_OPT
 bool translate_vaddpd(IR1_INST * pir1) {
@@ -5043,43 +5044,84 @@ bool translate_vpclmulqdq(IR1_INST * pir1) {
     int s1 = ir1_opnd_base_reg_num(opnd1);
     uint8_t ctrl = ir1_opnd_uimm(opnd3);
 
-    ADDR helper_func;
-
-    int helper_kind = 0;
     if (ir1_opnd_is_ymm(opnd0)) {
-        helper_func = (ADDR)helper_vpclmulqdq_ymm;
-        helper_kind = LOAD_HELPER_VPCLMULQDQ_YMM;
-    } else {
-        helper_func = (ADDR)helper_vpclmulqdq_xmm;
-        helper_kind = LOAD_HELPER_VPCLMULQDQ_XMM;
-    }
-
-    if (!ir1_opnd_is_mem(opnd2)) {
-        int s2 = ir1_opnd_base_reg_num(opnd2);
-        tr_gen_call_to_helper_pclmulqdq((ADDR)helper_func, s0, s1, s2, ctrl, 0,
-                                        helper_kind);
-    } else {
-        int s2 = 0;
-        while (s2 < 8) {
-            if (s2 != s0 && s2 != s1) {
-                break;
+        IR2_OPND src1 = load_freg256_from_ir1(opnd1);
+        IR2_OPND src2 = ir1_opnd_is_mem(opnd2) ? ra_alloc_ftemp() : load_freg256_from_ir1(opnd2);
+        IR2_OPND src1_copy = src1;
+        IR2_OPND src2_copy = src2;
+        IR2_OPND lhs = ra_alloc_itemp();
+        IR2_OPND rhs = ra_alloc_itemp();
+        IR2_OPND res_lo = ra_alloc_itemp();
+        IR2_OPND res_hi = ra_alloc_itemp();
+        /* A memory operand is loaded into the fresh src2 temporary. */
+        if (ir1_opnd_is_mem(opnd2)) {
+            if (ir1_opnd_size(opnd2) == 128) {
+                load_freg128_from_ir1_mem(src2, opnd2);
+            } else {
+                load_freg256_from_ir1_mem(src2, opnd2);
             }
-            s2++;
         }
-        IR2_OPND temp_mem = ra_alloc_ftemp();
-        IR2_OPND src = ra_alloc_xmm(s2);
-        la_xvor_v(temp_mem, src, src);
-        if (ir1_opnd_size(opnd2) == 128) {
-            load_freg128_from_ir1_mem(src, opnd2);
+        if (s0 == s1) {                 /* src1 may alias dest, zeroed below */
+            src1_copy = ra_alloc_ftemp();
+            la_xvori_b(src1_copy, src1, 0);     /* OR with 0: plain copy */
+        }
+        if (!ir1_opnd_is_mem(opnd2) && s0 == ir1_opnd_base_reg_num(opnd2)) {
+            src2_copy = ra_alloc_ftemp();
+            la_xvori_b(src2_copy, src2, 0);     /* OR with 0: plain copy */
+        }
+        /* NOTE(review): s0 is presumably dest's register number - confirm. */
+        la_xvxor_v(dest, dest, dest);
+        /* Low 128 bits: ctrl bit 0 / bit 4 pick a qword; result to 0..1. */
+        la_xvpickve2gr_d(lhs, src1_copy, (ctrl & 1) ? 1 : 0);
+        la_xvpickve2gr_d(rhs, src2_copy, (ctrl & 0x10) ? 1 : 0);
+        emit_pclmul_ctz_loop(lhs, rhs, res_lo, res_hi);
+        la_xvinsgr2vr_d(dest, res_lo, 0);
+        la_xvinsgr2vr_d(dest, res_hi, 1);
+        /* High 128 bits: same selection on qwords 2..3; result to 2..3. */
+        la_xvpickve2gr_d(lhs, src1_copy, (ctrl & 1) ? 3 : 2);
+        la_xvpickve2gr_d(rhs, src2_copy, (ctrl & 0x10) ? 3 : 2);
+        emit_pclmul_ctz_loop(lhs, rhs, res_lo, res_hi);
+        la_xvinsgr2vr_d(dest, res_lo, 2);
+        la_xvinsgr2vr_d(dest, res_hi, 3);
+
+        ra_free_temp_auto(src2);
+    } else {
+        IR2_OPND src1 = ra_alloc_xmm(s1);
+        IR2_OPND src2;
+        IR2_OPND temp = ra_alloc_ftemp();
+        IR2_OPND ctrlp = ra_alloc_itemp();
+        IR2_OPND lhs = ra_alloc_itemp();
+        IR2_OPND rhs = ra_alloc_itemp();
+        IR2_OPND res_lo = ra_alloc_itemp();
+        IR2_OPND res_hi = ra_alloc_itemp();
+        IR2_OPND ftemp = ra_alloc_ftemp();
+
+        if (!ir1_opnd_is_mem(opnd2)) {
+            src2 = ra_alloc_xmm(ir1_opnd_base_reg_num(opnd2));
         } else {
-            load_freg256_from_ir1_mem(src, opnd2);
+            src2 = ra_alloc_ftemp();
+            load_freg128_from_ir1_mem(src2, opnd2);
         }
-        tr_gen_call_to_helper_pclmulqdq((ADDR)helper_func, s0, s1, s2, ctrl, 0,
-                                        helper_kind);
-        la_xvor_v(src, temp_mem, temp_mem);
+        /* ctrl bit 0 selects the src1 qword, bit 4 the src2 qword. */
+        li_d(ctrlp, ctrl);
+        la_andi(lhs, ctrlp, 1);
+        la_vreplve_d(ftemp, src1, lhs);
+        la_vpickve2gr_d(lhs, ftemp, 0);
+        la_bstrpick_d(rhs, ctrlp, 4, 4);
+        la_vreplve_d(ftemp, src2, rhs);
+        la_vpickve2gr_d(rhs, ftemp, 0);
+
+        emit_pclmul_ctz_loop(lhs, rhs, res_lo, res_hi);
+        la_vxor_v(temp, temp, temp);
+        la_vinsgr2vr_d(temp, res_lo, 0);        /* qword 0 = low 64 bits */
+        la_vinsgr2vr_d(temp, res_hi, 1);        /* qword 1 = high 64 bits */
+        set_high128_xreg_to_zero(temp);
+        la_xvori_b(dest, temp, 0);              /* OR with 0: copy to dest */
+        ra_free_temp_auto(src2);
     }
-    if (ir1_opnd_size(opnd2) == 128)
+    if (ir1_opnd_size(opnd2) == 128) {
         set_high128_xreg_to_zero(dest);
+    }
     return true;
 }
 
diff --git a/target/i386/latx/translator/tr-simd.c b/target/i386/latx/translator/tr-simd.c
index 3421ab40fe..c358108f3c 100644
--- a/target/i386/latx/translator/tr-simd.c
+++ b/target/i386/latx/translator/tr-simd.c
@@ -4,6 +4,7 @@
 #include "translate.h"
 #include "hbr.h"
 #include "tr-vpaes.h"
+#include "pclmul.h"
 
 bool translate_por(IR1_INST *pir1)
 {
@@ -3707,52 +3708,6 @@ bool translate_pcmpistrm(IR1_INST *pir1)
     return true;
 }
 
-static void cal_pclmulqdq(IR2_OPND d, IR2_OPND v, IR2_OPND s, uint8_t ctrl)
-{
-    IR2_OPND ctrlp = ra_alloc_itemp();
-    IR2_OPND a = ra_alloc_itemp();// a = al
-    IR2_OPND b = ra_alloc_itemp();
-    IR2_OPND ftemp = ra_alloc_ftemp();
-
-    /* 选取操作数 */
-    li_d(ctrlp, ctrl);
-    la_andi(a, ctrlp, 1);// ((ctrl 1) != 0)
-    la_vreplve_d( ftemp, v, a);
-    la_vpickve2gr_d( a, ftemp, 0);
-    la_bstrpick_d( b, ctrlp, 4, 4);
-    la_vreplve_d( ftemp, s, b);
-    la_vpickve2gr_d( b, ftemp, 0);
-    ra_free_temp(ctrlp);
-
-    IR2_OPND ah = ra_alloc_itemp();
-    IR2_OPND resl = ra_alloc_itemp();
-    IR2_OPND resh = ra_alloc_itemp();
-    IR2_OPND all_label = ra_alloc_label();
-    IR2_OPND bit_label = ra_alloc_label();
-    IR2_OPND end_label = ra_alloc_label();
-
-    /* 开始运算 */
-    la_and( resl, resl, zero_ir2_opnd);
-    la_and( resh, resh, zero_ir2_opnd);
-    la_and( ah, ah, zero_ir2_opnd);
-    la_beqz( b, end_label);
-    la_label( all_label);
-    la_andi( a0_ir2_opnd, b, 1);// b 1
-    la_srli_d( b, b, 1);
-    la_beqz(a0_ir2_opnd, bit_label);
-    la_xor( resl, resl, a);
-    la_xor( resh, resh, ah);
-    la_label( bit_label);
-    la_slli_d(a1_ir2_opnd, ah, 1);
-    la_bstrpick_d(ah, a, 63, 63);
-    la_or( ah, a1_ir2_opnd, ah);
-    la_slli_d( a, a, 0x1);
-    la_bnez( b, all_label);
-    la_label( end_label);
-    la_vinsgr2vr_d( d, resl, 0);
-    la_vinsgr2vr_d( d, resh, 1);
-}
-
 bool translate_pclmulqdq(IR1_INST * pir1) {
     IR1_OPND * opnd0 = ir1_get_opnd(pir1, 0);
     IR1_OPND * opnd1 = ir1_get_opnd(pir1, 1);
@@ -3762,16 +3717,40 @@ bool translate_pclmulqdq(IR1_INST * pir1) {
     uint8_t ctrl = ir1_opnd_uimm(opnd2);
     IR2_OPND dest = ra_alloc_xmm(s0);
     IR2_OPND src;
+    IR2_OPND dest_copy = dest;
+    IR2_OPND ctrlp = ra_alloc_itemp();
+    IR2_OPND lhs = ra_alloc_itemp();
+    IR2_OPND rhs = ra_alloc_itemp();
+    IR2_OPND res_lo = ra_alloc_itemp();
+    IR2_OPND res_hi = ra_alloc_itemp();
+    IR2_OPND ftemp = ra_alloc_ftemp();
 
     if (!ir1_opnd_is_mem(opnd1)) {
         int s1 = ir1_opnd_base_reg_num(opnd1);
         src = ra_alloc_xmm(s1);
+        if (s0 == s1) {                     /* same register for both operands */
+            dest_copy = ra_alloc_ftemp();
+            la_vori_b(dest_copy, dest, 0);  /* OR with 0: plain copy */
+            src = dest_copy;
+        }
     } else {
         src = ra_alloc_ftemp();
         load_freg128_from_ir1_mem(src, opnd1);
     }
 
-    cal_pclmulqdq(dest, dest, src, ctrl);
+    li_d(ctrlp, ctrl);
+    la_andi(lhs, ctrlp, 1);                 /* ctrl bit 0: qword of dest_copy */
+    la_vreplve_d(ftemp, dest_copy, lhs);
+    la_vpickve2gr_d(lhs, ftemp, 0);
+    la_bstrpick_d(rhs, ctrlp, 4, 4);        /* ctrl bit 4: qword of src */
+    la_vreplve_d(ftemp, src, rhs);
+    la_vpickve2gr_d(rhs, ftemp, 0);
+    /* lhs/rhs now hold the two selected 64-bit multiplicands. */
+    emit_pclmul_ctz_loop(lhs, rhs, res_lo, res_hi);
+    la_vxor_v(dest, dest, dest);
+    la_vinsgr2vr_d(dest, res_lo, 0);        /* qword 0 = low 64 bits */
+    la_vinsgr2vr_d(dest, res_hi, 1);        /* qword 1 = high 64 bits */
+
     ra_free_temp_auto(src);
     return true;
 }