// Code generated by command: go run gen.go -out ../asm_amd64.s -stubs ../asm_amd64.go -pkg=minlz. DO NOT EDIT.

//go:build !appengine && !noasm && gc && !purego

#include "textflag.h"

// _dummy_ only returns. The preprocessor lines placed inside it define
// GOAMD64_v3 whenever GOAMD64_v4 is defined, so the "#ifdef GOAMD64_v3"
// branches further down (TZCNTQ instead of BSFQ) are also taken for v4.
// func _dummy_()
TEXT ·_dummy_(SB), $0
#ifdef GOAMD64_v4
#ifndef GOAMD64_v3
#define GOAMD64_v3
#endif
#endif
	RET

// encodeBlockAsm encodes src into dst and stores the number of bytes written
// in ret. It stores 0 and returns early whenever one of the dst bounds checks
// against (SP) fails. tmp is cleared on entry and then used as a table of
// 4-byte src offsets indexed by a 15-bit hash.
//
// Registers from search_loop until emit_remainder:
//   AX = tmp, BX = src base, CX = dst write pointer, DX = current src offset.
// Stack slots (24 bytes of frame):
//   (SP)   dst limit pointer: dst_base + (src_len-17) - src_len>>5
//   8(SP)  src_len-17; searching stops once the next position reaches it
//   12(SP) src offset of the first byte not yet written to dst
//   16(SP) distance of the most recent match, initially 1
//   20(SP) src offset to continue from when no candidate matches
//
// func encodeBlockAsm(dst []byte, src []byte, tmp *[131072]byte) int
// Requires: BMI, CMOV, SSE2
TEXT ·encodeBlockAsm(SB), $24-64
	MOVQ tmp+48(FP), AX
	MOVQ dst_base+0(FP), CX
	MOVQ $0x00000400, DX
	MOVQ AX, BX
	PXOR X0, X0

	// Clear tmp: 0x400 iterations of 0x80 bytes = 131072 bytes.
zero_loop_encodeBlockAsm:
	MOVOU X0, (BX)
	MOVOU X0, 16(BX)
	MOVOU X0, 32(BX)
	MOVOU X0, 48(BX)
	MOVOU X0, 64(BX)
	MOVOU X0, 80(BX)
	MOVOU X0, 96(BX)
	MOVOU X0, 112(BX)
	ADDQ $0x80, BX
	DECQ DX
	JNZ zero_loop_encodeBlockAsm

	// Initialise the stack slots described in the function header.
	MOVL $0x00000000, 12(SP)
	MOVQ src_len+32(FP), DX
	LEAQ -17(DX), BX
	LEAQ -17(DX), SI
	MOVL SI, 8(SP)
	SHRQ $0x05, DX
	SUBL DX, BX
	LEAQ (CX)(BX*1), BX
	MOVQ BX, (SP)
	MOVL $0x00000001, DX
	MOVL DX, 16(SP)
	MOVQ src_base+24(FP), BX

	// Main loop. SI = DX + 4 + (DX - 12(SP))>>6 is the position to try next
	// if nothing matches here; once it reaches 8(SP) the rest is emitted as
	// literals.
search_loop_encodeBlockAsm:
	MOVL DX, SI
	SUBL 12(SP), SI
	SHRL $0x06, SI
	LEAL 4(DX)(SI*1), SI
	CMPL SI, 8(SP)
	JAE emit_remainder_encodeBlockAsm

	// DI = 8 bytes at the current offset. R8 = DX-2162685: candidates at or
	// below it (signed compare) are skipped in no_repeat_found.
	MOVQ (BX)(DX*1), DI
	LEAL -2162685(DX), R8
	MOVL SI, 20(SP)

	// Hash: shift left 16 (keeps the low 6 bytes), multiply by the constant
	// and keep the top 15 bits. R11 hashes the bytes at DX, R12 those at DX+1.
	MOVQ $0x0000cf1bbcdcbf9b, R10
	MOVQ DI, R11
	MOVQ DI, R12
	SHRQ $0x08, R12
	SHLQ $0x10, R11
	IMULQ R10, R11
	SHRQ $0x31, R11
	SHLQ $0x10, R12
	IMULQ R10, R12
	SHRQ $0x31, R12

	// Load both candidates (SI, R9), then overwrite both table slots.
	// NOTE(review): both slots are written with DX although R12 was hashed
	// from the bytes at DX+1 - confirm against gen.go that this is intended.
	MOVL (AX)(R11*4), SI
	MOVL (AX)(R12*4), R9
	MOVL DX, (AX)(R11*4)
	MOVL DX, (AX)(R12*4)

	// R11 = hash of the bytes at DX+2, used from offset_ok_0 onwards.
	MOVQ DI, R11
	SHRQ $0x10, R11
	SHLQ $0x10, R11
	IMULQ R10, R11
	SHRQ $0x31, R11

	// Repeat check: compare the 4 bytes at DX+1 with the 4 bytes one match
	// distance (16(SP)) earlier.
	MOVL DX, R10
	SUBL 16(SP), R10
	MOVL 1(BX)(R10*1), R12
	MOVQ DI, R10
	SHRQ $0x08, R10
	CMPL R10, R12
	JNE no_repeat_found_encodeBlockAsm

	// Repeat found at DI = DX+1. Extend it backwards while the preceding
	// bytes match, DI stays above 12(SP) and R8 (DI - distance) is non-zero.
	LEAL 1(DX), DI
	MOVL 12(SP), SI
	MOVL DI, R8
	SUBL 16(SP), R8
	JZ repeat_extend_back_end_encodeBlockAsm

repeat_extend_back_loop_encodeBlockAsm:
	CMPL DI, SI
	JBE repeat_extend_back_end_encodeBlockAsm
	MOVB -1(BX)(R8*1), R9
	MOVB -1(BX)(DI*1), R10
	CMPB R9, R10
	JNE repeat_extend_back_end_encodeBlockAsm
	LEAL -1(DI), DI
	DECL R8
	JNZ repeat_extend_back_loop_encodeBlockAsm

	// SI = number of pending literal bytes (DI - 12(SP)). Return 0 unless
	// CX + 4 + SI is still below the dst limit.
repeat_extend_back_end_encodeBlockAsm:
	MOVL DI, SI
	MOVL 12(SP), R8
	SUBL R8, SI
	LEAQ 4(CX)(SI*1), R9
	CMPQ R9, (SP)
	JB dst_size_check_ok_1
	MOVQ $0x00000000, ret+56(FP)
	RET

dst_size_check_ok_1:
	LEAQ (BX)(R8*1), R8

	// emitLiteral
	// Header: with R9 = len-1, a single byte R9<<3 when R9 < 0x1d; otherwise
	// tag 0xe8 / 0xf0 / 0xf8 followed by 1 / 2 / 3 bytes holding R9-0x1d.
	LEAL -1(SI), R9
	CMPL R9, $0x1d
	JB one_byte_repeat_emit_lits_encodeBlockAsm
	SUBL $0x1d, R9
	CMPL R9, $0x00000100
	JB two_bytes_repeat_emit_lits_encodeBlockAsm
	CMPL R9, $0x00010000
	JB three_bytes_repeat_emit_lits_encodeBlockAsm
	MOVL R9, R10
	SHRL $0x10, R10
	MOVB $0xf8, (CX)
	MOVW R9, 1(CX)
	MOVB R10, 3(CX)
	ADDQ $0x04, CX
	ADDL $0x1d, R9
	JMP memmove_long_repeat_emit_lits_encodeBlockAsm

three_bytes_repeat_emit_lits_encodeBlockAsm:
	MOVB $0xf0, (CX)
	MOVW R9, 1(CX)
	ADDQ $0x03, CX
	ADDL $0x1d, R9
	JMP memmove_long_repeat_emit_lits_encodeBlockAsm

two_bytes_repeat_emit_lits_encodeBlockAsm:
	MOVB $0xe8, (CX)
	MOVB R9, 1(CX)
	ADDL $0x1d, R9
	ADDQ $0x02, CX
	CMPL R9, $0x40
	JB memmove_midrepeat_emit_lits_encodeBlockAsm
	JMP memmove_long_repeat_emit_lits_encodeBlockAsm

one_byte_repeat_emit_lits_encodeBlockAsm:
	SHLB $0x03, R9
	MOVB R9, (CX)
	ADDQ $0x01, CX
	LEAQ (CX)(SI*1), R9

	// genMemMoveShort
	// margin: 16, min move: 1
	// R9 = CX + SI is the write pointer after the copy. Lengths up to 16 are
	// handled with one full 16-byte load/store.
	CMPQ SI, $0x10
	JBE emit_lit_memmove_repeat_emit_lits_encodeBlockAsm_memmove_move_8through16
	CMPQ SI, $0x20
	JBE emit_lit_memmove_repeat_emit_lits_encodeBlockAsm_memmove_move_17through32
	JMP emit_lit_memmove_repeat_emit_lits_encodeBlockAsm_memmove_move_33through64

emit_lit_memmove_repeat_emit_lits_encodeBlockAsm_memmove_move_8through16:
	MOVOU (R8), X0
	MOVOU X0, (CX)
	JMP memmove_end_copy_repeat_emit_lits_encodeBlockAsm

emit_lit_memmove_repeat_emit_lits_encodeBlockAsm_memmove_move_17through32:
	MOVOU (R8), X0
	MOVOU -16(R8)(SI*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(SI*1)
	JMP memmove_end_copy_repeat_emit_lits_encodeBlockAsm

emit_lit_memmove_repeat_emit_lits_encodeBlockAsm_memmove_move_33through64:
	MOVOU (R8), X0
	MOVOU 16(R8), X1
	MOVOU -32(R8)(SI*1), X2
	MOVOU -16(R8)(SI*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(SI*1)
	MOVOU X3, -16(CX)(SI*1)

memmove_end_copy_repeat_emit_lits_encodeBlockAsm:
	MOVQ R9, CX
	JMP repeat_emit_lits_end_encodeBlockAsm

memmove_midrepeat_emit_lits_encodeBlockAsm:
	LEAQ (CX)(SI*1), R9

	// genMemMoveShort
	// margin: 15, min move: 30
	CMPQ SI, $0x20
	JBE emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm_memmove_move_17through32
	JMP emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm_memmove_move_33through64

emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm_memmove_move_17through32:
	MOVOU (R8), X0
	MOVOU -16(R8)(SI*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(SI*1)
	JMP memmove_mid_end_copy_repeat_emit_lits_encodeBlockAsm

emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm_memmove_move_33through64:
	MOVOU (R8), X0
	MOVOU 16(R8), X1
	MOVOU -32(R8)(SI*1), X2
	MOVOU -16(R8)(SI*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(SI*1)
	MOVOU X3, -16(CX)(SI*1)

memmove_mid_end_copy_repeat_emit_lits_encodeBlockAsm:
	MOVQ R9, CX
	JMP repeat_emit_lits_end_encodeBlockAsm

memmove_long_repeat_emit_lits_encodeBlockAsm:
	LEAQ (CX)(SI*1), R9

	// genMemMoveLong
	// The first and last 32 bytes are loaded into X0-X3 up front and stored
	// last. In between, 32-byte chunks are copied starting at offset
	// R12-32, where R12 = 64 - (CX & 31), so the MOVOA stores to dst are
	// aligned.
	MOVOU (R8), X0
	MOVOU 16(R8), X1
	MOVOU -32(R8)(SI*1), X2
	MOVOU -16(R8)(SI*1), X3
	MOVQ SI, R11
	SHRQ $0x05, R11
	MOVQ CX, R10
	ANDL $0x0000001f, R10
	MOVQ $0x00000040, R12
	SUBQ R10, R12
	DECQ R11
	JA emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsmlarge_forward_sse_loop_32
	LEAQ -32(R8)(R12*1), R10
	LEAQ -32(CX)(R12*1), R13

emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsmlarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R13)
	MOVOA X5, 16(R13)
	ADDQ $0x20, R13
	ADDQ $0x20, R10
	ADDQ $0x20, R12
	DECQ R11
	JNA emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsmlarge_big_loop_back

emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsmlarge_forward_sse_loop_32:
	MOVOU -32(R8)(R12*1), X4
	MOVOU -16(R8)(R12*1), X5
	MOVOA X4, -32(CX)(R12*1)
	MOVOA X5, -16(CX)(R12*1)
	ADDQ $0x20, R12
	CMPQ SI, R12
	JAE emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsmlarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(SI*1)
	MOVOU X3, -16(CX)(SI*1)
	MOVQ R9, CX

	// Literals are written. Skip the 4 bytes compared by the repeat check
	// (DX+1 .. DX+4) and extend the repeat forwards. R8 = bytes left in src,
	// R9 = current position, SI = position one match distance earlier.
repeat_emit_lits_end_encodeBlockAsm:
	ADDL $0x05, DX
	MOVL DX, SI
	SUBL 16(SP), SI
	MOVQ src_len+32(FP), R8
	SUBL DX, R8
	LEAQ (BX)(DX*1), R9
	LEAQ (BX)(SI*1), SI

	// matchLen
	// R11 counts equal bytes, compared 16, 8, 4, 2 and 1 at a time. On a
	// 64-bit mismatch the lowest set bit of the XOR, shifted right by 3,
	// gives the number of equal bytes in that word.
	XORL R11, R11
	JMP matchlen_loop_16_entry_repeat_extend_encodeBlockAsm

matchlen_loopback_16_repeat_extend_encodeBlockAsm:
	MOVQ (R9)(R11*1), R10
	MOVQ 8(R9)(R11*1), R12
	XORQ (SI)(R11*1), R10
	JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm
	XORQ 8(SI)(R11*1), R12
	JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm
	LEAL -16(R8), R8
	LEAL 16(R11), R11

matchlen_loop_16_entry_repeat_extend_encodeBlockAsm:
	CMPL R8, $0x10
	JAE matchlen_loopback_16_repeat_extend_encodeBlockAsm
	JMP matchlen_match8_repeat_extend_encodeBlockAsm

matchlen_bsf_16repeat_extend_encodeBlockAsm:
#ifdef GOAMD64_v3
	TZCNTQ R12, R12
#else
	BSFQ R12, R12
#endif
	SARQ $0x03, R12
	LEAL 8(R11)(R12*1), R11
	JMP repeat_extend_forward_end_encodeBlockAsm

matchlen_match8_repeat_extend_encodeBlockAsm:
	CMPL R8, $0x08
	JB matchlen_match4_repeat_extend_encodeBlockAsm
	MOVQ (R9)(R11*1), R10
	XORQ (SI)(R11*1), R10
	JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm
	LEAL -8(R8), R8
	LEAL 8(R11), R11
	JMP matchlen_match4_repeat_extend_encodeBlockAsm

matchlen_bsf_8_repeat_extend_encodeBlockAsm:
#ifdef GOAMD64_v3
	TZCNTQ R10, R10
#else
	BSFQ R10, R10
#endif
	SARQ $0x03, R10
	LEAL (R11)(R10*1), R11
	JMP repeat_extend_forward_end_encodeBlockAsm

matchlen_match4_repeat_extend_encodeBlockAsm:
	CMPL R8, $0x04
	JB matchlen_match2_repeat_extend_encodeBlockAsm
	MOVL (R9)(R11*1), R10
	CMPL (SI)(R11*1), R10
	JNE matchlen_match2_repeat_extend_encodeBlockAsm
	LEAL -4(R8), R8
	LEAL 4(R11), R11

matchlen_match2_repeat_extend_encodeBlockAsm:
	CMPL R8, $0x01
	JE matchlen_match1_repeat_extend_encodeBlockAsm
	JB repeat_extend_forward_end_encodeBlockAsm
	MOVW (R9)(R11*1), R10
	CMPW (SI)(R11*1), R10
	JNE matchlen_match1_repeat_extend_encodeBlockAsm
	LEAL 2(R11), R11
	SUBL $0x02, R8
	JZ repeat_extend_forward_end_encodeBlockAsm

matchlen_match1_repeat_extend_encodeBlockAsm:
	MOVB (R9)(R11*1), R10
	CMPB (SI)(R11*1), R10
	JNE repeat_extend_forward_end_encodeBlockAsm
	LEAL 1(R11), R11

	// DX = end of the repeat, SI = its length (DX - DI). The value loaded
	// from 16(SP) into DI is overwritten by the next instruction.
repeat_extend_forward_end_encodeBlockAsm:
	ADDL R11, DX
	MOVL DX, SI
	SUBL DI, SI
	MOVL 16(SP), DI

	// emitRepeat
	// One byte (SI*8 - 4) when SI <= 0x1d; otherwise tag 0xec / 0xf4 / 0xfc
	// followed by SI-30 (the 0xfc form stores it with a 4-byte write but
	// advances CX by 4 in total).
	LEAL -1(SI), DI
	CMPL SI, $0x1d
	JBE repeat_one_match_repeat_encodeBlockAsm
	LEAL -30(SI), DI
	CMPL SI, $0x0000011e
	JB repeat_two_match_repeat_encodeBlockAsm
	CMPL SI, $0x0001001e
	JB repeat_three_match_repeat_encodeBlockAsm
	MOVB $0xfc, (CX)
	MOVL DI, 1(CX)
	ADDQ $0x04, CX
	JMP repeat_end_emit_encodeBlockAsm

repeat_three_match_repeat_encodeBlockAsm:
	MOVB $0xf4, (CX)
	MOVW DI, 1(CX)
	ADDQ $0x03, CX
	JMP repeat_end_emit_encodeBlockAsm

repeat_two_match_repeat_encodeBlockAsm:
	MOVB $0xec, (CX)
	MOVB DI, 1(CX)
	ADDQ $0x02, CX
	JMP repeat_end_emit_encodeBlockAsm

repeat_one_match_repeat_encodeBlockAsm:
	XORL DI, DI
	LEAL -4(DI)(SI*8), DI
	MOVB DI, (CX)
	ADDQ $0x01, CX

repeat_end_emit_encodeBlockAsm:
	MOVL DX, 12(SP)
	JMP search_loop_encodeBlockAsm

	// No repeat. Try the three candidates in turn: SI against the 4 bytes at
	// DX, R9 against those at DX+1, and the table entry for R11 against
	// those at DX+2. A candidate at or below R8 is skipped as too distant.
no_repeat_found_encodeBlockAsm:
	CMPL SI, R8
	JLE offset_ok_0_encodeBlockAsm
	CMPL (BX)(SI*1), DI
	JEQ candidate_match_encodeBlockAsm

offset_ok_0_encodeBlockAsm:
	SHRQ $0x08, DI
	MOVL (AX)(R11*4), SI
	LEAL 2(DX), R10
	CMPL R9, R8
	JLE offset_ok_1_encodeBlockAsm
	CMPL (BX)(R9*1), DI
	JEQ candidate2_match_encodeBlockAsm

offset_ok_1_encodeBlockAsm:
	MOVL R10, (AX)(R11*4)
	SHRQ $0x08, DI
	CMPL SI, R8
	JLE offset_ok_2_encodeBlockAsm
	CMPL (BX)(SI*1), DI
	JEQ candidate3_match_encodeBlockAsm

	// Nothing matched: continue from the position saved in 20(SP).
offset_ok_2_encodeBlockAsm:
	MOVL 20(SP), DX
	JMP search_loop_encodeBlockAsm

candidate3_match_encodeBlockAsm:
	ADDL $0x02, DX
	JMP candidate_match_encodeBlockAsm

candidate2_match_encodeBlockAsm:
	MOVL R10, (AX)(R11*4)
	INCL DX
	MOVL R9, SI

	// Match between DX and candidate SI. Extend it backwards while the
	// preceding bytes are equal, DX stays above 12(SP) and SI is non-zero.
candidate_match_encodeBlockAsm:
	MOVL 12(SP), DI
	TESTL SI, SI
	JZ match_extend_back_end_encodeBlockAsm

match_extend_back_loop_encodeBlockAsm:
	CMPL DX, DI
	JBE match_extend_back_end_encodeBlockAsm
	MOVB -1(BX)(SI*1), R8
	MOVB -1(BX)(DX*1), R9
	CMPB R8, R9
	JNE match_extend_back_end_encodeBlockAsm
	LEAL -1(DX), DX
	DECL SI
	JZ match_extend_back_end_encodeBlockAsm
	JMP match_extend_back_loop_encodeBlockAsm

match_extend_back_end_encodeBlockAsm:
	CMPQ CX, (SP)
	JB dst_size_check_ok_2
	MOVQ $0x00000000, ret+56(FP)
	RET

	// R8 keeps the match start, 16(SP) receives the new distance DX - SI.
	// Both positions then skip the 4 bytes already known to match.
dst_size_check_ok_2:
	MOVL DX, R8
	MOVL DX, DI
	SUBL SI, DI
	MOVL DI, 16(SP)
	ADDL $0x04, DX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), DI
	SUBL DX, DI
	LEAQ (BX)(DX*1), R9
	LEAQ (BX)(SI*1), SI

	// matchLen
	XORL R11, R11
	JMP matchlen_loop_16_entry_match_nolit_encodeBlockAsm

matchlen_loopback_16_match_nolit_encodeBlockAsm:
	MOVQ (R9)(R11*1), R10
	MOVQ 8(R9)(R11*1), R12
	XORQ (SI)(R11*1), R10
	JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm
	XORQ 8(SI)(R11*1), R12
	JNZ matchlen_bsf_16match_nolit_encodeBlockAsm
	LEAL -16(DI), DI
	LEAL 16(R11), R11

matchlen_loop_16_entry_match_nolit_encodeBlockAsm:
	CMPL DI, $0x10
	JAE matchlen_loopback_16_match_nolit_encodeBlockAsm
	JMP matchlen_match8_match_nolit_encodeBlockAsm

matchlen_bsf_16match_nolit_encodeBlockAsm:
#ifdef GOAMD64_v3
	TZCNTQ R12, R12
#else
	BSFQ R12, R12
#endif
	SARQ $0x03, R12
	LEAL 8(R11)(R12*1), R11
	JMP match_nolit_end_encodeBlockAsm

matchlen_match8_match_nolit_encodeBlockAsm:
	CMPL DI, $0x08
	JB matchlen_match4_match_nolit_encodeBlockAsm
	MOVQ (R9)(R11*1), R10
	XORQ (SI)(R11*1), R10
	JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm
	LEAL -8(DI), DI
	LEAL 8(R11), R11
	JMP matchlen_match4_match_nolit_encodeBlockAsm

matchlen_bsf_8_match_nolit_encodeBlockAsm:
#ifdef GOAMD64_v3
	TZCNTQ R10, R10
#else
	BSFQ R10, R10
#endif
	SARQ $0x03, R10
	LEAL (R11)(R10*1), R11
	JMP match_nolit_end_encodeBlockAsm

matchlen_match4_match_nolit_encodeBlockAsm:
	CMPL DI, $0x04
	JB matchlen_match2_match_nolit_encodeBlockAsm
	MOVL (R9)(R11*1), R10
	CMPL (SI)(R11*1), R10
	JNE matchlen_match2_match_nolit_encodeBlockAsm
	LEAL -4(DI), DI
	LEAL 4(R11), R11

matchlen_match2_match_nolit_encodeBlockAsm:
	CMPL DI, $0x01
	JE matchlen_match1_match_nolit_encodeBlockAsm
	JB match_nolit_end_encodeBlockAsm
	MOVW (R9)(R11*1), R10
	CMPW (SI)(R11*1), R10
	JNE matchlen_match1_match_nolit_encodeBlockAsm
	LEAL 2(R11), R11
	SUBL $0x02, DI
	JZ match_nolit_end_encodeBlockAsm

matchlen_match1_match_nolit_encodeBlockAsm:
	MOVB (R9)(R11*1), R10
	CMPB (SI)(R11*1), R10
	JNE match_nolit_end_encodeBlockAsm
	LEAL 1(R11), R11

	// Here DX = match end, R11 = full match length (extension + 4),
	// SI = distance, R8 = number of pending literal bytes (match start minus
	// the old 12(SP)). With no literals go straight to the copy. With more
	// than 3 literals, or a distance below 0x40, literals are emitted
	// separately. Otherwise the 1-3 literals are stored inside the copy:
	// 4 bytes are loaded from src and CX is advanced by R8 only.
match_nolit_end_encodeBlockAsm:
	ADDL R11, DX
	ADDL $0x04, R11
	MOVL 16(SP), SI
	MOVL 12(SP), DI
	MOVL DX, 12(SP)
	SUBL DI, R8
	JZ match_nolits_copy_encodeBlockAsm
	LEAQ (BX)(DI*1), DI
	CMPL R8, $0x03
	JA match_emit_lits_copy_encodeBlockAsm
	CMPL SI, $0x40
	JB match_emit_lits_copy_encodeBlockAsm
	MOVL (DI), DI
	CMPL SI, $0x0001003f
	JBE match_emit_copy2lits_encodeBlockAsm

	// emitCopy3
	LEAL -4(R11), R11
	LEAL -65536(SI), SI
	SHLL $0x0b, SI
	LEAL 7(SI)(R8*8), SI
	CMPL R11, $0x3c
	JBE emit_copy3_0_match_emit_lits_encodeBlockAsm
	LEAL -60(R11), R9
	CMPL R11, $0x0000013c
	JB emit_copy3_1_match_emit_lits_encodeBlockAsm
	CMPL R11, $0x0001003c
	JB emit_copy3_2_match_emit_lits_encodeBlockAsm
	ADDL $0x000007e0, SI
	MOVL SI, (CX)
	MOVL R9, 4(CX)
	ADDQ $0x07, CX
	JMP match_emit_copy_litsencodeBlockAsm

emit_copy3_2_match_emit_lits_encodeBlockAsm:
	ADDL $0x000007c0, SI
	MOVL SI, (CX)
	MOVW R9, 4(CX)
	ADDQ $0x06, CX
	JMP match_emit_copy_litsencodeBlockAsm

emit_copy3_1_match_emit_lits_encodeBlockAsm:
	ADDL $0x000007a0, SI
	MOVL SI, (CX)
	MOVB R9, 4(CX)
	ADDQ $0x05, CX
	JMP match_emit_copy_litsencodeBlockAsm

emit_copy3_0_match_emit_lits_encodeBlockAsm:
	SHLL $0x05, R11
	ORL R11, SI
	MOVL SI, (CX)
	ADDQ $0x04, CX

match_emit_copy_litsencodeBlockAsm:
	MOVL DI, (CX)
	ADDQ R8, CX
	JMP match_nolit_emitcopy_end_encodeBlockAsm

match_emit_copy2lits_encodeBlockAsm:
	// emitCopy2WithLits
	// With R11 = length-4: if R11 >= 7 the copy encodes 7 and the remaining
	// length-11 (R9) is emitted as a repeat afterwards; otherwise the copy
	// encodes R11 itself and R9 stays 0.
	XORQ R9, R9
	SUBL $0x40, SI
	LEAL -11(R11), R10
	LEAL -4(R11), R11
	MOVW SI, 1(CX)
	CMPL R11, $0x07
	CMOVLGE R10, R9
	MOVQ $0x00000007, SI
	CMOVLLT R11, SI
	LEAL -1(R8)(SI*4), SI
	MOVL $0x00000003, R10
	LEAL (R10)(SI*8), SI
	MOVB SI, (CX)
	ADDQ $0x03, CX
	MOVL DI, (CX)
	ADDQ R8, CX
	TESTL R9, R9
	JZ match_nolit_emitcopy_end_encodeBlockAsm

	// emitRepeat
	LEAL -1(R9), SI
	CMPL R9, $0x1d
	JBE repeat_one_match_emit_repeat_copy2_encodeBlockAsm
	LEAL -30(R9), SI
	CMPL R9, $0x0000011e
	JB repeat_two_match_emit_repeat_copy2_encodeBlockAsm
	CMPL R9, $0x0001001e
	JB repeat_three_match_emit_repeat_copy2_encodeBlockAsm
	MOVB $0xfc, (CX)
	MOVL SI, 1(CX)
	ADDQ $0x04, CX
	JMP match_nolit_emitcopy_end_encodeBlockAsm

repeat_three_match_emit_repeat_copy2_encodeBlockAsm:
	MOVB $0xf4, (CX)
	MOVW SI, 1(CX)
	ADDQ $0x03, CX
	JMP match_nolit_emitcopy_end_encodeBlockAsm

repeat_two_match_emit_repeat_copy2_encodeBlockAsm:
	MOVB $0xec, (CX)
	MOVB SI, 1(CX)
	ADDQ $0x02, CX
	JMP match_nolit_emitcopy_end_encodeBlockAsm

repeat_one_match_emit_repeat_copy2_encodeBlockAsm:
	XORL SI, SI
	LEAL -4(SI)(R9*8), SI
	MOVB SI, (CX)
	ADDQ $0x01, CX
	JMP match_nolit_emitcopy_end_encodeBlockAsm

	// Literals emitted separately before the copy: R8 bytes starting at DI.
match_emit_lits_copy_encodeBlockAsm:
	LEAQ 4(CX)(R8*1), R9
	CMPQ R9, (SP)
	JB dst_size_check_ok_3
	MOVQ $0x00000000, ret+56(FP)
	RET

dst_size_check_ok_3:
	// emitLiteral
	LEAL -1(R8), R9
	CMPL R9, $0x1d
	JB one_byte_match_emit_encodeBlockAsm
	SUBL $0x1d, R9
	CMPL R9, $0x00000100
	JB two_bytes_match_emit_encodeBlockAsm
	CMPL R9, $0x00010000
	JB three_bytes_match_emit_encodeBlockAsm
	MOVL R9, R10
	SHRL $0x10, R10
	MOVB $0xf8, (CX)
	MOVW R9, 1(CX)
	MOVB R10, 3(CX)
	ADDQ $0x04, CX
	ADDL $0x1d, R9
	JMP memmove_long_match_emit_encodeBlockAsm

three_bytes_match_emit_encodeBlockAsm:
	MOVB $0xf0, (CX)
	MOVW R9, 1(CX)
	ADDQ $0x03, CX
	ADDL $0x1d, R9
	JMP memmove_long_match_emit_encodeBlockAsm

two_bytes_match_emit_encodeBlockAsm:
	MOVB $0xe8, (CX)
	MOVB R9, 1(CX)
	ADDL $0x1d, R9
	ADDQ $0x02, CX
	CMPL R9, $0x40
	JB memmove_midmatch_emit_encodeBlockAsm
	JMP memmove_long_match_emit_encodeBlockAsm

one_byte_match_emit_encodeBlockAsm:
	SHLB $0x03, R9
	MOVB R9, (CX)
	ADDQ $0x01, CX
	LEAQ (CX)(R8*1), R9

	// genMemMoveShort
	// margin: 16, min move: 1
	CMPQ R8, $0x10
	JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16
	CMPQ R8, $0x20
	JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32
	JMP emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64

emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16:
	MOVOU (DI), X0
	MOVOU X0, (CX)
	JMP memmove_end_copy_match_emit_encodeBlockAsm

emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32:
	MOVOU (DI), X0
	MOVOU -16(DI)(R8*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(R8*1)
	JMP memmove_end_copy_match_emit_encodeBlockAsm

emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64:
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R8*1), X2
	MOVOU -16(DI)(R8*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R8*1)
	MOVOU X3, -16(CX)(R8*1)

memmove_end_copy_match_emit_encodeBlockAsm:
	MOVQ R9, CX
	JMP match_nolits_copy_encodeBlockAsm

memmove_midmatch_emit_encodeBlockAsm:
	LEAQ (CX)(R8*1), R9

	// genMemMoveShort
	// margin: 15, min move: 30
	CMPQ R8, $0x20
	JBE emit_lit_memmove_mid_match_emit_encodeBlockAsm_memmove_move_17through32
	JMP emit_lit_memmove_mid_match_emit_encodeBlockAsm_memmove_move_33through64

emit_lit_memmove_mid_match_emit_encodeBlockAsm_memmove_move_17through32:
	MOVOU (DI), X0
	MOVOU -16(DI)(R8*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(R8*1)
	JMP memmove_mid_end_copy_match_emit_encodeBlockAsm

emit_lit_memmove_mid_match_emit_encodeBlockAsm_memmove_move_33through64:
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R8*1), X2
	MOVOU -16(DI)(R8*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R8*1)
	MOVOU X3, -16(CX)(R8*1)

memmove_mid_end_copy_match_emit_encodeBlockAsm:
	MOVQ R9, CX
	JMP match_nolits_copy_encodeBlockAsm

memmove_long_match_emit_encodeBlockAsm:
	LEAQ (CX)(R8*1), R9

	// genMemMoveLong
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R8*1), X2
	MOVOU -16(DI)(R8*1), X3
	MOVQ R8, R12
	SHRQ $0x05, R12
	MOVQ CX, R10
	ANDL $0x0000001f, R10
	MOVQ $0x00000040, R13
	SUBQ R10, R13
	DECQ R12
	JA emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32
	LEAQ -32(DI)(R13*1), R10
	LEAQ -32(CX)(R13*1), R14

emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R14)
	MOVOA X5, 16(R14)
	ADDQ $0x20, R14
	ADDQ $0x20, R10
	ADDQ $0x20, R13
	DECQ R12
	JNA emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32:
	MOVOU -32(DI)(R13*1), X4
	MOVOU -16(DI)(R13*1), X5
	MOVOA X4, -32(CX)(R13*1)
	MOVOA X5, -16(CX)(R13*1)
	ADDQ $0x20, R13
	CMPQ R8, R13
	JAE emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R8*1)
	MOVOU X3, -16(CX)(R8*1)
	MOVQ R9, CX

	// Copy without embedded literals: SI = distance, R11 = length. The
	// encoding is picked by distance: above 0x1003f emitCopy3, above 0x400
	// emitCopy2, otherwise the short forms that follow.
match_nolits_copy_encodeBlockAsm:
	// emitCopy
	CMPL SI, $0x0001003f
	JBE two_byte_offset_match_nolit_encodeBlockAsm

	// emitCopy3
	LEAL -4(R11), R11
	LEAL -65536(SI), SI
	SHLL $0x0b, SI
	ADDL $0x07, SI
	CMPL R11, $0x3c
	JBE emit_copy3_0_match_nolit_encodeBlockAsm_emit3
	LEAL -60(R11), DI
	CMPL R11, $0x0000013c
	JB emit_copy3_1_match_nolit_encodeBlockAsm_emit3
	CMPL R11, $0x0001003c
	JB emit_copy3_2_match_nolit_encodeBlockAsm_emit3
	ADDL $0x000007e0, SI
	MOVL SI, (CX)
	MOVL DI, 4(CX)
	ADDQ $0x07, CX
	JMP match_nolit_emitcopy_end_encodeBlockAsm

emit_copy3_2_match_nolit_encodeBlockAsm_emit3:
	ADDL $0x000007c0, SI
	MOVL SI, (CX)
	MOVW DI, 4(CX)
	ADDQ $0x06, CX
	JMP match_nolit_emitcopy_end_encodeBlockAsm

emit_copy3_1_match_nolit_encodeBlockAsm_emit3:
	ADDL $0x000007a0, SI
	MOVL SI, (CX)
	MOVB DI, 4(CX)
	ADDQ $0x05, CX
	JMP match_nolit_emitcopy_end_encodeBlockAsm

emit_copy3_0_match_nolit_encodeBlockAsm_emit3:
	SHLL $0x05, R11
	ORL R11, SI
	MOVL SI, (CX)
	ADDQ $0x04, CX
	JMP match_nolit_emitcopy_end_encodeBlockAsm

	// Distance <= 0x400: 2 bytes when length < 0x13, 3 bytes when
	// length < 0x112, otherwise a 2-byte copy followed by a repeat for
	// length-0x12.
two_byte_offset_match_nolit_encodeBlockAsm:
	CMPL SI, $0x00000400
	JA two_byte_match_nolit_encodeBlockAsm
	CMPL R11, $0x00000013
	JAE emit_one_longer_match_nolit_encodeBlockAsm
	LEAL -1(SI), SI
	SHLL $0x06, SI
	LEAL -15(SI)(R11*4), SI
	MOVW SI, (CX)
	ADDQ $0x02, CX
	JMP match_nolit_emitcopy_end_encodeBlockAsm

emit_one_longer_match_nolit_encodeBlockAsm:
	CMPL R11, $0x00000112
	JAE emit_copy1_repeat_match_nolit_encodeBlockAsm
	LEAL -1(SI), SI
	SHLL $0x06, SI
	LEAL 61(SI), SI
	MOVW SI, (CX)
	LEAL -18(R11), SI
	MOVB SI, 2(CX)
	ADDQ $0x03, CX
	JMP match_nolit_emitcopy_end_encodeBlockAsm

emit_copy1_repeat_match_nolit_encodeBlockAsm:
	LEAL -1(SI), SI
	SHLL $0x06, SI
	LEAL 57(SI), SI
	MOVW SI, (CX)
	ADDQ $0x02, CX
	SUBL $0x12, R11

	// emitRepeat
	LEAL -1(R11), SI
	CMPL R11, $0x1d
	JBE repeat_one_emit_copy1_do_repeat_match_nolit_encodeBlockAsm
	LEAL -30(R11), SI
	CMPL R11, $0x0000011e
	JB repeat_two_emit_copy1_do_repeat_match_nolit_encodeBlockAsm
	CMPL R11, $0x0001001e
	JB repeat_three_emit_copy1_do_repeat_match_nolit_encodeBlockAsm
	MOVB $0xfc, (CX)
	MOVL SI, 1(CX)
	ADDQ $0x04, CX
	JMP match_nolit_emitcopy_end_encodeBlockAsm

repeat_three_emit_copy1_do_repeat_match_nolit_encodeBlockAsm:
	MOVB $0xf4, (CX)
	MOVW SI, 1(CX)
	ADDQ $0x03, CX
	JMP match_nolit_emitcopy_end_encodeBlockAsm

repeat_two_emit_copy1_do_repeat_match_nolit_encodeBlockAsm:
	MOVB $0xec, (CX)
	MOVB SI, 1(CX)
	ADDQ $0x02, CX
	JMP match_nolit_emitcopy_end_encodeBlockAsm

repeat_one_emit_copy1_do_repeat_match_nolit_encodeBlockAsm:
	XORL SI, SI
	LEAL -4(SI)(R11*8), SI
	MOVB SI, (CX)
	ADDQ $0x01, CX
	JMP match_nolit_emitcopy_end_encodeBlockAsm

two_byte_match_nolit_encodeBlockAsm:
	// emitCopy2
	LEAL -64(SI), SI
	LEAL -4(R11), R11
	MOVW SI, 1(CX)
	CMPL R11, $0x3c
	JBE emit_copy2_0_match_nolit_encodeBlockAsm_emit2
	LEAL -60(R11), SI
	CMPL R11, $0x0000013c
	JB emit_copy2_1_match_nolit_encodeBlockAsm_emit2
	CMPL R11, $0x0001003c
	JB emit_copy2_2_match_nolit_encodeBlockAsm_emit2
	MOVB $0xfe, (CX)
	MOVL SI, 3(CX)
	ADDQ $0x06, CX
	JMP match_nolit_emitcopy_end_encodeBlockAsm

emit_copy2_2_match_nolit_encodeBlockAsm_emit2:
	MOVB $0xfa, (CX)
	MOVW SI, 3(CX)
	ADDQ $0x05, CX
	JMP match_nolit_emitcopy_end_encodeBlockAsm

emit_copy2_1_match_nolit_encodeBlockAsm_emit2:
	MOVB $0xf6, (CX)
	MOVB SI, 3(CX)
	ADDQ $0x04, CX
	JMP match_nolit_emitcopy_end_encodeBlockAsm

emit_copy2_0_match_nolit_encodeBlockAsm_emit2:
	MOVL $0x00000002, SI
	LEAL (SI)(R11*4), SI
	MOVB SI, (CX)
	ADDQ $0x03, CX

	// A copy has been written and DX is the match end. Finish if DX reached
	// 8(SP); return 0 if CX reached the dst limit. Otherwise load the 8 bytes
	// at DX-2, store DX-2 and DX in the table slots for the bytes at DX-2 and
	// at DX, and test the previous entry of the second slot (SI) as an
	// immediate match at DX.
match_nolit_emitcopy_end_encodeBlockAsm:
	CMPL DX, 8(SP)
	JAE emit_remainder_encodeBlockAsm
	MOVQ -2(BX)(DX*1), DI
	CMPQ CX, (SP)
	JB match_nolit_dst_ok_encodeBlockAsm
	MOVQ $0x00000000, ret+56(FP)
	RET

match_nolit_dst_ok_encodeBlockAsm:
	MOVQ $0x0000cf1bbcdcbf9b, SI
	MOVQ DI, R8
	SHRQ $0x10, DI
	MOVQ DI, R9
	SHLQ $0x10, R8
	IMULQ SI, R8
	SHRQ $0x31, R8
	SHLQ $0x10, R9
	IMULQ SI, R9
	SHRQ $0x31, R9
	LEAL -2(DX), R10
	MOVL (AX)(R9*4), SI
	MOVL R10, (AX)(R8*4)
	MOVL DX, (AX)(R9*4)
	MOVL DX, R8
	INCL DX

	// NOTE(review): JA is an unsigned compare, whereas the distance checks in
	// no_repeat_found use signed JLE. While R8 < 2162687 the LEAL result is
	// negative and, compared unsigned, no candidate passes - confirm against
	// gen.go that this is intended.
	LEAL -2162687(R8), R9
	CMPL SI, R9
	JA match_nolit_len_okencodeBlockAsm
	JMP search_loop_encodeBlockAsm

match_nolit_len_okencodeBlockAsm:
	CMPL (BX)(SI*1), DI
	JNE search_loop_encodeBlockAsm
	MOVL R8, DI
	SUBL SI, DI
	MOVL DI, 16(SP)
	CMPQ CX, (SP)
	JB dst_size_check_ok_4
	MOVQ $0x00000000, ret+56(FP)
	RET

	// Immediate match with no literals in front of it: DX was already
	// incremented once, so +3 moves it past the 4 compared bytes.
dst_size_check_ok_4:
	ADDL $0x03, DX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), DI
	SUBL DX, DI
	LEAQ (BX)(DX*1), R8
	LEAQ (BX)(SI*1), SI

	// matchLen
	XORL R11, R11
	JMP matchlen_loop_16_entry_match_nolit2_encodeBlockAsm

matchlen_loopback_16_match_nolit2_encodeBlockAsm:
	MOVQ (R8)(R11*1), R9
	MOVQ 8(R8)(R11*1), R10
	XORQ (SI)(R11*1), R9
	JNZ matchlen_bsf_8_match_nolit2_encodeBlockAsm
	XORQ 8(SI)(R11*1), R10
	JNZ matchlen_bsf_16match_nolit2_encodeBlockAsm
	LEAL -16(DI), DI
	LEAL 16(R11), R11

matchlen_loop_16_entry_match_nolit2_encodeBlockAsm:
	CMPL DI, $0x10
	JAE matchlen_loopback_16_match_nolit2_encodeBlockAsm
	JMP matchlen_match8_match_nolit2_encodeBlockAsm

matchlen_bsf_16match_nolit2_encodeBlockAsm:
#ifdef GOAMD64_v3
	TZCNTQ R10, R10
#else
	BSFQ R10, R10
#endif
	SARQ $0x03, R10
	LEAL 8(R11)(R10*1), R11
	JMP match_nolit2_end_encodeBlockAsm

matchlen_match8_match_nolit2_encodeBlockAsm:
	CMPL DI, $0x08
	JB matchlen_match4_match_nolit2_encodeBlockAsm
	MOVQ (R8)(R11*1), R9
	XORQ (SI)(R11*1), R9
	JNZ matchlen_bsf_8_match_nolit2_encodeBlockAsm
	LEAL -8(DI), DI
	LEAL 8(R11), R11
	JMP matchlen_match4_match_nolit2_encodeBlockAsm

matchlen_bsf_8_match_nolit2_encodeBlockAsm:
#ifdef GOAMD64_v3
	TZCNTQ R9, R9
#else
	BSFQ R9, R9
#endif
	SARQ $0x03, R9
	LEAL (R11)(R9*1), R11
	JMP match_nolit2_end_encodeBlockAsm

matchlen_match4_match_nolit2_encodeBlockAsm:
	CMPL DI, $0x04
	JB matchlen_match2_match_nolit2_encodeBlockAsm
	MOVL (R8)(R11*1), R9
	CMPL (SI)(R11*1), R9
	JNE matchlen_match2_match_nolit2_encodeBlockAsm
	LEAL -4(DI), DI
	LEAL 4(R11), R11

matchlen_match2_match_nolit2_encodeBlockAsm:
	CMPL DI, $0x01
	JE matchlen_match1_match_nolit2_encodeBlockAsm
	JB match_nolit2_end_encodeBlockAsm
	MOVW (R8)(R11*1), R9
	CMPW (SI)(R11*1), R9
	JNE matchlen_match1_match_nolit2_encodeBlockAsm
	LEAL 2(R11), R11
	SUBL $0x02, DI
	JZ match_nolit2_end_encodeBlockAsm

matchlen_match1_match_nolit2_encodeBlockAsm:
	MOVB (R8)(R11*1), R9
	CMPB (SI)(R11*1), R9
	JNE match_nolit2_end_encodeBlockAsm
	LEAL 1(R11), R11

match_nolit2_end_encodeBlockAsm:
	ADDL R11, DX
	ADDL $0x04, R11
	MOVL DX, 12(SP)
	MOVL 16(SP), SI
	JMP match_nolits_copy_encodeBlockAsm

	// End of search: emit src[12(SP):src_len] as one literal run. From here
	// on AX = remaining length, DX = source pointer and BX is scratch.
emit_remainder_encodeBlockAsm:
	MOVQ src_len+32(FP), AX
	MOVL 12(SP), DX
	SUBL DX, AX
	JZ emit_remainder_end_encodeBlockAsm
	LEAQ (BX)(DX*1), DX
	LEAQ 4(CX)(AX*1), BX
	CMPQ BX, (SP)
	JB dst_size_check_ok_5
	MOVQ $0x00000000, ret+56(FP)
	RET

dst_size_check_ok_5:
	// emitLiteral
	LEAL -1(AX), BX
	CMPL BX, $0x1d
	JB one_byte_emit_remainder_encodeBlockAsm
	SUBL $0x1d, BX
	CMPL BX, $0x00000100
	JB two_bytes_emit_remainder_encodeBlockAsm
	CMPL BX, $0x00010000
	JB three_bytes_emit_remainder_encodeBlockAsm
	MOVL BX, SI
	SHRL $0x10, SI
	MOVB $0xf8, (CX)
	MOVW BX, 1(CX)
	MOVB SI, 3(CX)
	ADDQ $0x04, CX
	ADDL $0x1d, BX
	JMP memmove_long_emit_remainder_encodeBlockAsm

three_bytes_emit_remainder_encodeBlockAsm:
	MOVB $0xf0, (CX)
	MOVW BX, 1(CX)
	ADDQ $0x03, CX
	ADDL $0x1d, BX
	JMP memmove_long_emit_remainder_encodeBlockAsm

two_bytes_emit_remainder_encodeBlockAsm:
	MOVB $0xe8, (CX)
	MOVB BL, 1(CX)
	ADDL $0x1d, BX
	ADDQ $0x02, CX
	CMPL BX, $0x40
	JB memmove_midemit_remainder_encodeBlockAsm
	JMP memmove_long_emit_remainder_encodeBlockAsm

one_byte_emit_remainder_encodeBlockAsm:
	SHLB $0x03, BL
	MOVB BL, (CX)
	ADDQ $0x01, CX
	LEAQ (CX)(AX*1), BX

	// genMemMoveShort
	// margin: 0, min move: 1
	// Unlike the margin-16 copies above, each size class here uses loads and
	// stores that stay within the AX bytes being copied.
	CMPQ AX, $0x03
	JB emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_1or2
	JE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_3
	CMPQ AX, $0x08
	JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_4through8
	CMPQ AX, $0x10
	JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16
	CMPQ AX, $0x20
	JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32
	JMP emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_1or2:
	MOVB (DX), SI
	MOVB -1(DX)(AX*1), DL
	MOVB SI, (CX)
	MOVB DL, -1(CX)(AX*1)
	JMP memmove_end_copy_emit_remainder_encodeBlockAsm

emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_3:
	MOVW (DX), SI
	MOVB 2(DX), DL
	MOVW SI, (CX)
	MOVB DL, 2(CX)
	JMP memmove_end_copy_emit_remainder_encodeBlockAsm

emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_4through8:
	MOVL (DX), SI
	MOVL -4(DX)(AX*1), DX
	MOVL SI, (CX)
	MOVL DX, -4(CX)(AX*1)
	JMP memmove_end_copy_emit_remainder_encodeBlockAsm

emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16:
	MOVQ (DX), SI
	MOVQ -8(DX)(AX*1), DX
	MOVQ SI, (CX)
	MOVQ DX, -8(CX)(AX*1)
	JMP memmove_end_copy_emit_remainder_encodeBlockAsm

emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32:
	MOVOU (DX), X0
	MOVOU -16(DX)(AX*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(AX*1)
	JMP memmove_end_copy_emit_remainder_encodeBlockAsm

emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64:
	MOVOU (DX), X0
	MOVOU 16(DX), X1
	MOVOU -32(DX)(AX*1), X2
	MOVOU -16(DX)(AX*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(AX*1)
	MOVOU X3, -16(CX)(AX*1)

memmove_end_copy_emit_remainder_encodeBlockAsm:
	MOVQ BX, CX
	JMP emit_remainder_end_encodeBlockAsm

memmove_midemit_remainder_encodeBlockAsm:
	LEAQ (CX)(AX*1), BX

	// genMemMoveShort
	// margin: 0, min move: 30
	CMPQ AX, $0x20
	JBE emit_lit_memmove_mid_emit_remainder_encodeBlockAsm_memmove_move_17through32
	JMP emit_lit_memmove_mid_emit_remainder_encodeBlockAsm_memmove_move_33through64

emit_lit_memmove_mid_emit_remainder_encodeBlockAsm_memmove_move_17through32:
	MOVOU (DX), X0
	MOVOU -16(DX)(AX*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(AX*1)
	JMP memmove_mid_end_copy_emit_remainder_encodeBlockAsm

emit_lit_memmove_mid_emit_remainder_encodeBlockAsm_memmove_move_33through64:
	MOVOU (DX), X0
	MOVOU 16(DX), X1
	MOVOU -32(DX)(AX*1), X2
	MOVOU -16(DX)(AX*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(AX*1)
	MOVOU X3, -16(CX)(AX*1)

memmove_mid_end_copy_emit_remainder_encodeBlockAsm:
	MOVQ BX, CX
	JMP emit_remainder_end_encodeBlockAsm

memmove_long_emit_remainder_encodeBlockAsm:
	LEAQ (CX)(AX*1), BX

	// genMemMoveLong
	MOVOU (DX), X0
	MOVOU 16(DX), X1
	MOVOU -32(DX)(AX*1), X2
	MOVOU -16(DX)(AX*1), X3
	MOVQ AX, DI
	SHRQ $0x05, DI
	MOVQ CX, SI
	ANDL $0x0000001f, SI
	MOVQ $0x00000040, R8
	SUBQ SI, R8
	DECQ DI
	JA emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32
	LEAQ -32(DX)(R8*1), SI
	LEAQ -32(CX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ $0x20, R9
	ADDQ $0x20, SI
	ADDQ $0x20, R8
	DECQ DI
	JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32:
	MOVOU -32(DX)(R8*1), X4
	MOVOU -16(DX)(R8*1), X5
	MOVOA X4, -32(CX)(R8*1)
	MOVOA X5, -16(CX)(R8*1)
	ADDQ $0x20, R8
	CMPQ AX, R8
	JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(AX*1)
	MOVOU X3, -16(CX)(AX*1)
	MOVQ BX, CX

	// Result = bytes written = CX - dst_base.
emit_remainder_end_encodeBlockAsm:
	MOVQ dst_base+0(FP), AX
	SUBQ AX, CX
	MOVQ CX, ret+56(FP)
	RET

// func encodeBlockAsm2MB(dst []byte, src []byte, tmp *[131072]byte) int
// Requires: BMI, CMOV, SSE2
TEXT ·encodeBlockAsm2MB(SB), $24-64
	MOVQ tmp+48(FP), AX
	MOVQ dst_base+0(FP), CX
	MOVQ $0x00000400, DX
	MOVQ AX, BX
	PXOR X0, X0

zero_loop_encodeBlockAsm2MB:
	MOVOU X0, (BX)
	MOVOU X0, 16(BX)
	MOVOU X0, 32(BX)
	MOVOU X0, 48(BX)
	MOVOU X0, 64(BX)
	MOVOU X0, 80(BX)
	MOVOU X0, 96(BX)
	MOVOU X0, 112(BX)
	ADDQ $0x80, BX
	DECQ DX
	JNZ zero_loop_encodeBlockAsm2MB
	MOVL $0x00000000, 12(SP)
	MOVQ
src_len+32(FP), DX LEAQ -17(DX), BX LEAQ -17(DX), SI MOVL SI, 8(SP) SHRQ $0x05, DX SUBL DX, BX LEAQ (CX)(BX*1), BX MOVQ BX, (SP) MOVL $0x00000001, DX MOVL DX, 16(SP) MOVQ src_base+24(FP), BX search_loop_encodeBlockAsm2MB: MOVL DX, SI SUBL 12(SP), SI SHRL $0x06, SI LEAL 4(DX)(SI*1), SI CMPL SI, 8(SP) JAE emit_remainder_encodeBlockAsm2MB MOVQ (BX)(DX*1), DI MOVL SI, 20(SP) MOVQ $0x0000cf1bbcdcbf9b, R9 MOVQ DI, R10 MOVQ DI, R11 SHRQ $0x08, R11 SHLQ $0x10, R10 IMULQ R9, R10 SHRQ $0x31, R10 SHLQ $0x10, R11 IMULQ R9, R11 SHRQ $0x31, R11 MOVL (AX)(R10*4), SI MOVL (AX)(R11*4), R8 MOVL DX, (AX)(R10*4) MOVL DX, (AX)(R11*4) MOVQ DI, R10 SHRQ $0x10, R10 SHLQ $0x10, R10 IMULQ R9, R10 SHRQ $0x31, R10 MOVL DX, R9 SUBL 16(SP), R9 MOVL 1(BX)(R9*1), R11 MOVQ DI, R9 SHRQ $0x08, R9 CMPL R9, R11 JNE no_repeat_found_encodeBlockAsm2MB LEAL 1(DX), DI MOVL 12(SP), SI MOVL DI, R8 SUBL 16(SP), R8 JZ repeat_extend_back_end_encodeBlockAsm2MB repeat_extend_back_loop_encodeBlockAsm2MB: CMPL DI, SI JBE repeat_extend_back_end_encodeBlockAsm2MB MOVB -1(BX)(R8*1), R9 MOVB -1(BX)(DI*1), R10 CMPB R9, R10 JNE repeat_extend_back_end_encodeBlockAsm2MB LEAL -1(DI), DI DECL R8 JNZ repeat_extend_back_loop_encodeBlockAsm2MB repeat_extend_back_end_encodeBlockAsm2MB: MOVL DI, SI MOVL 12(SP), R8 SUBL R8, SI LEAQ 4(CX)(SI*1), R9 CMPQ R9, (SP) JB dst_size_check_ok_1 MOVQ $0x00000000, ret+56(FP) RET dst_size_check_ok_1: LEAQ (BX)(R8*1), R8 // emitLiteral LEAL -1(SI), R9 CMPL R9, $0x1d JB one_byte_repeat_emit_lits_encodeBlockAsm2MB SUBL $0x1d, R9 CMPL R9, $0x00000100 JB two_bytes_repeat_emit_lits_encodeBlockAsm2MB CMPL R9, $0x00010000 JB three_bytes_repeat_emit_lits_encodeBlockAsm2MB MOVL R9, R10 SHRL $0x10, R10 MOVB $0xf8, (CX) MOVW R9, 1(CX) MOVB R10, 3(CX) ADDQ $0x04, CX ADDL $0x1d, R9 JMP memmove_long_repeat_emit_lits_encodeBlockAsm2MB three_bytes_repeat_emit_lits_encodeBlockAsm2MB: MOVB $0xf0, (CX) MOVW R9, 1(CX) ADDQ $0x03, CX ADDL $0x1d, R9 JMP memmove_long_repeat_emit_lits_encodeBlockAsm2MB 
two_bytes_repeat_emit_lits_encodeBlockAsm2MB: MOVB $0xe8, (CX) MOVB R9, 1(CX) ADDL $0x1d, R9 ADDQ $0x02, CX CMPL R9, $0x40 JB memmove_midrepeat_emit_lits_encodeBlockAsm2MB JMP memmove_long_repeat_emit_lits_encodeBlockAsm2MB one_byte_repeat_emit_lits_encodeBlockAsm2MB: SHLB $0x03, R9 MOVB R9, (CX) ADDQ $0x01, CX LEAQ (CX)(SI*1), R9 // genMemMoveShort // margin: 16, min move: 1 CMPQ SI, $0x10 JBE emit_lit_memmove_repeat_emit_lits_encodeBlockAsm2MB_memmove_move_8through16 CMPQ SI, $0x20 JBE emit_lit_memmove_repeat_emit_lits_encodeBlockAsm2MB_memmove_move_17through32 JMP emit_lit_memmove_repeat_emit_lits_encodeBlockAsm2MB_memmove_move_33through64 emit_lit_memmove_repeat_emit_lits_encodeBlockAsm2MB_memmove_move_8through16: MOVOU (R8), X0 MOVOU X0, (CX) JMP memmove_end_copy_repeat_emit_lits_encodeBlockAsm2MB emit_lit_memmove_repeat_emit_lits_encodeBlockAsm2MB_memmove_move_17through32: MOVOU (R8), X0 MOVOU -16(R8)(SI*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(SI*1) JMP memmove_end_copy_repeat_emit_lits_encodeBlockAsm2MB emit_lit_memmove_repeat_emit_lits_encodeBlockAsm2MB_memmove_move_33through64: MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(SI*1), X2 MOVOU -16(R8)(SI*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(SI*1) MOVOU X3, -16(CX)(SI*1) memmove_end_copy_repeat_emit_lits_encodeBlockAsm2MB: MOVQ R9, CX JMP repeat_emit_lits_end_encodeBlockAsm2MB memmove_midrepeat_emit_lits_encodeBlockAsm2MB: LEAQ (CX)(SI*1), R9 // genMemMoveShort // margin: 15, min move: 30 CMPQ SI, $0x20 JBE emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm2MB_memmove_move_17through32 JMP emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm2MB_memmove_move_33through64 emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm2MB_memmove_move_17through32: MOVOU (R8), X0 MOVOU -16(R8)(SI*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(SI*1) JMP memmove_mid_end_copy_repeat_emit_lits_encodeBlockAsm2MB emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm2MB_memmove_move_33through64: MOVOU (R8), X0 MOVOU 
16(R8), X1 MOVOU -32(R8)(SI*1), X2 MOVOU -16(R8)(SI*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(SI*1) MOVOU X3, -16(CX)(SI*1) memmove_mid_end_copy_repeat_emit_lits_encodeBlockAsm2MB: MOVQ R9, CX JMP repeat_emit_lits_end_encodeBlockAsm2MB memmove_long_repeat_emit_lits_encodeBlockAsm2MB: LEAQ (CX)(SI*1), R9 // genMemMoveLong MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(SI*1), X2 MOVOU -16(R8)(SI*1), X3 MOVQ SI, R11 SHRQ $0x05, R11 MOVQ CX, R10 ANDL $0x0000001f, R10 MOVQ $0x00000040, R12 SUBQ R10, R12 DECQ R11 JA emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm2MBlarge_forward_sse_loop_32 LEAQ -32(R8)(R12*1), R10 LEAQ -32(CX)(R12*1), R13 emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm2MBlarge_big_loop_back: MOVOU (R10), X4 MOVOU 16(R10), X5 MOVOA X4, (R13) MOVOA X5, 16(R13) ADDQ $0x20, R13 ADDQ $0x20, R10 ADDQ $0x20, R12 DECQ R11 JNA emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm2MBlarge_big_loop_back emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm2MBlarge_forward_sse_loop_32: MOVOU -32(R8)(R12*1), X4 MOVOU -16(R8)(R12*1), X5 MOVOA X4, -32(CX)(R12*1) MOVOA X5, -16(CX)(R12*1) ADDQ $0x20, R12 CMPQ SI, R12 JAE emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm2MBlarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(SI*1) MOVOU X3, -16(CX)(SI*1) MOVQ R9, CX repeat_emit_lits_end_encodeBlockAsm2MB: ADDL $0x05, DX MOVL DX, SI SUBL 16(SP), SI MOVQ src_len+32(FP), R8 SUBL DX, R8 LEAQ (BX)(DX*1), R9 LEAQ (BX)(SI*1), SI // matchLen XORL R11, R11 JMP matchlen_loop_16_entry_repeat_extend_encodeBlockAsm2MB matchlen_loopback_16_repeat_extend_encodeBlockAsm2MB: MOVQ (R9)(R11*1), R10 MOVQ 8(R9)(R11*1), R12 XORQ (SI)(R11*1), R10 JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm2MB XORQ 8(SI)(R11*1), R12 JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm2MB LEAL -16(R8), R8 LEAL 16(R11), R11 matchlen_loop_16_entry_repeat_extend_encodeBlockAsm2MB: CMPL R8, $0x10 JAE matchlen_loopback_16_repeat_extend_encodeBlockAsm2MB JMP 
matchlen_match8_repeat_extend_encodeBlockAsm2MB matchlen_bsf_16repeat_extend_encodeBlockAsm2MB: #ifdef GOAMD64_v3 TZCNTQ R12, R12 #else BSFQ R12, R12 #endif SARQ $0x03, R12 LEAL 8(R11)(R12*1), R11 JMP repeat_extend_forward_end_encodeBlockAsm2MB matchlen_match8_repeat_extend_encodeBlockAsm2MB: CMPL R8, $0x08 JB matchlen_match4_repeat_extend_encodeBlockAsm2MB MOVQ (R9)(R11*1), R10 XORQ (SI)(R11*1), R10 JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm2MB LEAL -8(R8), R8 LEAL 8(R11), R11 JMP matchlen_match4_repeat_extend_encodeBlockAsm2MB matchlen_bsf_8_repeat_extend_encodeBlockAsm2MB: #ifdef GOAMD64_v3 TZCNTQ R10, R10 #else BSFQ R10, R10 #endif SARQ $0x03, R10 LEAL (R11)(R10*1), R11 JMP repeat_extend_forward_end_encodeBlockAsm2MB matchlen_match4_repeat_extend_encodeBlockAsm2MB: CMPL R8, $0x04 JB matchlen_match2_repeat_extend_encodeBlockAsm2MB MOVL (R9)(R11*1), R10 CMPL (SI)(R11*1), R10 JNE matchlen_match2_repeat_extend_encodeBlockAsm2MB LEAL -4(R8), R8 LEAL 4(R11), R11 matchlen_match2_repeat_extend_encodeBlockAsm2MB: CMPL R8, $0x01 JE matchlen_match1_repeat_extend_encodeBlockAsm2MB JB repeat_extend_forward_end_encodeBlockAsm2MB MOVW (R9)(R11*1), R10 CMPW (SI)(R11*1), R10 JNE matchlen_match1_repeat_extend_encodeBlockAsm2MB LEAL 2(R11), R11 SUBL $0x02, R8 JZ repeat_extend_forward_end_encodeBlockAsm2MB matchlen_match1_repeat_extend_encodeBlockAsm2MB: MOVB (R9)(R11*1), R10 CMPB (SI)(R11*1), R10 JNE repeat_extend_forward_end_encodeBlockAsm2MB LEAL 1(R11), R11 repeat_extend_forward_end_encodeBlockAsm2MB: ADDL R11, DX MOVL DX, SI SUBL DI, SI MOVL 16(SP), DI // emitRepeat LEAL -1(SI), DI CMPL SI, $0x1d JBE repeat_one_match_repeat_encodeBlockAsm2MB LEAL -30(SI), DI CMPL SI, $0x0000011e JB repeat_two_match_repeat_encodeBlockAsm2MB CMPL SI, $0x0001001e JB repeat_three_match_repeat_encodeBlockAsm2MB MOVB $0xfc, (CX) MOVL DI, 1(CX) ADDQ $0x04, CX JMP repeat_end_emit_encodeBlockAsm2MB repeat_three_match_repeat_encodeBlockAsm2MB: MOVB $0xf4, (CX) MOVW DI, 1(CX) ADDQ $0x03, CX JMP 
repeat_end_emit_encodeBlockAsm2MB repeat_two_match_repeat_encodeBlockAsm2MB: MOVB $0xec, (CX) MOVB DI, 1(CX) ADDQ $0x02, CX JMP repeat_end_emit_encodeBlockAsm2MB repeat_one_match_repeat_encodeBlockAsm2MB: XORL DI, DI LEAL -4(DI)(SI*8), DI MOVB DI, (CX) ADDQ $0x01, CX repeat_end_emit_encodeBlockAsm2MB: MOVL DX, 12(SP) JMP search_loop_encodeBlockAsm2MB no_repeat_found_encodeBlockAsm2MB: CMPL (BX)(SI*1), DI JEQ candidate_match_encodeBlockAsm2MB SHRQ $0x08, DI MOVL (AX)(R10*4), SI LEAL 2(DX), R9 CMPL (BX)(R8*1), DI JEQ candidate2_match_encodeBlockAsm2MB MOVL R9, (AX)(R10*4) SHRQ $0x08, DI CMPL (BX)(SI*1), DI JEQ candidate3_match_encodeBlockAsm2MB MOVL 20(SP), DX JMP search_loop_encodeBlockAsm2MB candidate3_match_encodeBlockAsm2MB: ADDL $0x02, DX JMP candidate_match_encodeBlockAsm2MB candidate2_match_encodeBlockAsm2MB: MOVL R9, (AX)(R10*4) INCL DX MOVL R8, SI candidate_match_encodeBlockAsm2MB: MOVL 12(SP), DI TESTL SI, SI JZ match_extend_back_end_encodeBlockAsm2MB match_extend_back_loop_encodeBlockAsm2MB: CMPL DX, DI JBE match_extend_back_end_encodeBlockAsm2MB MOVB -1(BX)(SI*1), R8 MOVB -1(BX)(DX*1), R9 CMPB R8, R9 JNE match_extend_back_end_encodeBlockAsm2MB LEAL -1(DX), DX DECL SI JZ match_extend_back_end_encodeBlockAsm2MB JMP match_extend_back_loop_encodeBlockAsm2MB match_extend_back_end_encodeBlockAsm2MB: CMPQ CX, (SP) JB dst_size_check_ok_2 MOVQ $0x00000000, ret+56(FP) RET dst_size_check_ok_2: MOVL DX, R8 MOVL DX, DI SUBL SI, DI MOVL DI, 16(SP) ADDL $0x04, DX ADDL $0x04, SI MOVQ src_len+32(FP), DI SUBL DX, DI LEAQ (BX)(DX*1), R9 LEAQ (BX)(SI*1), SI // matchLen XORL R11, R11 JMP matchlen_loop_16_entry_match_nolit_encodeBlockAsm2MB matchlen_loopback_16_match_nolit_encodeBlockAsm2MB: MOVQ (R9)(R11*1), R10 MOVQ 8(R9)(R11*1), R12 XORQ (SI)(R11*1), R10 JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm2MB XORQ 8(SI)(R11*1), R12 JNZ matchlen_bsf_16match_nolit_encodeBlockAsm2MB LEAL -16(DI), DI LEAL 16(R11), R11 matchlen_loop_16_entry_match_nolit_encodeBlockAsm2MB: CMPL DI, 
$0x10 JAE matchlen_loopback_16_match_nolit_encodeBlockAsm2MB JMP matchlen_match8_match_nolit_encodeBlockAsm2MB matchlen_bsf_16match_nolit_encodeBlockAsm2MB: #ifdef GOAMD64_v3 TZCNTQ R12, R12 #else BSFQ R12, R12 #endif SARQ $0x03, R12 LEAL 8(R11)(R12*1), R11 JMP match_nolit_end_encodeBlockAsm2MB matchlen_match8_match_nolit_encodeBlockAsm2MB: CMPL DI, $0x08 JB matchlen_match4_match_nolit_encodeBlockAsm2MB MOVQ (R9)(R11*1), R10 XORQ (SI)(R11*1), R10 JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm2MB LEAL -8(DI), DI LEAL 8(R11), R11 JMP matchlen_match4_match_nolit_encodeBlockAsm2MB matchlen_bsf_8_match_nolit_encodeBlockAsm2MB: #ifdef GOAMD64_v3 TZCNTQ R10, R10 #else BSFQ R10, R10 #endif SARQ $0x03, R10 LEAL (R11)(R10*1), R11 JMP match_nolit_end_encodeBlockAsm2MB matchlen_match4_match_nolit_encodeBlockAsm2MB: CMPL DI, $0x04 JB matchlen_match2_match_nolit_encodeBlockAsm2MB MOVL (R9)(R11*1), R10 CMPL (SI)(R11*1), R10 JNE matchlen_match2_match_nolit_encodeBlockAsm2MB LEAL -4(DI), DI LEAL 4(R11), R11 matchlen_match2_match_nolit_encodeBlockAsm2MB: CMPL DI, $0x01 JE matchlen_match1_match_nolit_encodeBlockAsm2MB JB match_nolit_end_encodeBlockAsm2MB MOVW (R9)(R11*1), R10 CMPW (SI)(R11*1), R10 JNE matchlen_match1_match_nolit_encodeBlockAsm2MB LEAL 2(R11), R11 SUBL $0x02, DI JZ match_nolit_end_encodeBlockAsm2MB matchlen_match1_match_nolit_encodeBlockAsm2MB: MOVB (R9)(R11*1), R10 CMPB (SI)(R11*1), R10 JNE match_nolit_end_encodeBlockAsm2MB LEAL 1(R11), R11 match_nolit_end_encodeBlockAsm2MB: ADDL R11, DX ADDL $0x04, R11 MOVL 16(SP), SI MOVL 12(SP), DI MOVL DX, 12(SP) SUBL DI, R8 JZ match_nolits_copy_encodeBlockAsm2MB LEAQ (BX)(DI*1), DI CMPL R8, $0x03 JA match_emit_lits_copy_encodeBlockAsm2MB CMPL SI, $0x40 JB match_emit_lits_copy_encodeBlockAsm2MB MOVL (DI), DI CMPL SI, $0x0001003f JBE match_emit_copy2lits_encodeBlockAsm2MB // emitCopy3 LEAL -4(R11), R11 LEAL -65536(SI), SI SHLL $0x0b, SI LEAL 7(SI)(R8*8), SI CMPL R11, $0x3c JBE emit_copy3_0_match_emit_lits_encodeBlockAsm2MB LEAL 
-60(R11), R9 CMPL R11, $0x0000013c JB emit_copy3_1_match_emit_lits_encodeBlockAsm2MB CMPL R11, $0x0001003c JB emit_copy3_2_match_emit_lits_encodeBlockAsm2MB ADDL $0x000007e0, SI MOVL SI, (CX) MOVL R9, 4(CX) ADDQ $0x07, CX JMP match_emit_copy_litsencodeBlockAsm2MB emit_copy3_2_match_emit_lits_encodeBlockAsm2MB: ADDL $0x000007c0, SI MOVL SI, (CX) MOVW R9, 4(CX) ADDQ $0x06, CX JMP match_emit_copy_litsencodeBlockAsm2MB emit_copy3_1_match_emit_lits_encodeBlockAsm2MB: ADDL $0x000007a0, SI MOVL SI, (CX) MOVB R9, 4(CX) ADDQ $0x05, CX JMP match_emit_copy_litsencodeBlockAsm2MB emit_copy3_0_match_emit_lits_encodeBlockAsm2MB: SHLL $0x05, R11 ORL R11, SI MOVL SI, (CX) ADDQ $0x04, CX match_emit_copy_litsencodeBlockAsm2MB: MOVL DI, (CX) ADDQ R8, CX JMP match_nolit_emitcopy_end_encodeBlockAsm2MB match_emit_copy2lits_encodeBlockAsm2MB: // emitCopy2WithLits XORQ R9, R9 SUBL $0x40, SI LEAL -11(R11), R10 LEAL -4(R11), R11 MOVW SI, 1(CX) CMPL R11, $0x07 CMOVLGE R10, R9 MOVQ $0x00000007, SI CMOVLLT R11, SI LEAL -1(R8)(SI*4), SI MOVL $0x00000003, R10 LEAL (R10)(SI*8), SI MOVB SI, (CX) ADDQ $0x03, CX MOVL DI, (CX) ADDQ R8, CX TESTL R9, R9 JZ match_nolit_emitcopy_end_encodeBlockAsm2MB // emitRepeat LEAL -1(R9), SI CMPL R9, $0x1d JBE repeat_one_match_emit_repeat_copy2_encodeBlockAsm2MB LEAL -30(R9), SI CMPL R9, $0x0000011e JB repeat_two_match_emit_repeat_copy2_encodeBlockAsm2MB CMPL R9, $0x0001001e JB repeat_three_match_emit_repeat_copy2_encodeBlockAsm2MB MOVB $0xfc, (CX) MOVL SI, 1(CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBlockAsm2MB repeat_three_match_emit_repeat_copy2_encodeBlockAsm2MB: MOVB $0xf4, (CX) MOVW SI, 1(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBlockAsm2MB repeat_two_match_emit_repeat_copy2_encodeBlockAsm2MB: MOVB $0xec, (CX) MOVB SI, 1(CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBlockAsm2MB repeat_one_match_emit_repeat_copy2_encodeBlockAsm2MB: XORL SI, SI LEAL -4(SI)(R9*8), SI MOVB SI, (CX) ADDQ $0x01, CX JMP 
match_nolit_emitcopy_end_encodeBlockAsm2MB match_emit_lits_copy_encodeBlockAsm2MB: LEAQ 4(CX)(R8*1), R9 CMPQ R9, (SP) JB dst_size_check_ok_3 MOVQ $0x00000000, ret+56(FP) RET dst_size_check_ok_3: // emitLiteral LEAL -1(R8), R9 CMPL R9, $0x1d JB one_byte_match_emit_encodeBlockAsm2MB SUBL $0x1d, R9 CMPL R9, $0x00000100 JB two_bytes_match_emit_encodeBlockAsm2MB CMPL R9, $0x00010000 JB three_bytes_match_emit_encodeBlockAsm2MB MOVL R9, R10 SHRL $0x10, R10 MOVB $0xf8, (CX) MOVW R9, 1(CX) MOVB R10, 3(CX) ADDQ $0x04, CX ADDL $0x1d, R9 JMP memmove_long_match_emit_encodeBlockAsm2MB three_bytes_match_emit_encodeBlockAsm2MB: MOVB $0xf0, (CX) MOVW R9, 1(CX) ADDQ $0x03, CX ADDL $0x1d, R9 JMP memmove_long_match_emit_encodeBlockAsm2MB two_bytes_match_emit_encodeBlockAsm2MB: MOVB $0xe8, (CX) MOVB R9, 1(CX) ADDL $0x1d, R9 ADDQ $0x02, CX CMPL R9, $0x40 JB memmove_midmatch_emit_encodeBlockAsm2MB JMP memmove_long_match_emit_encodeBlockAsm2MB one_byte_match_emit_encodeBlockAsm2MB: SHLB $0x03, R9 MOVB R9, (CX) ADDQ $0x01, CX LEAQ (CX)(R8*1), R9 // genMemMoveShort // margin: 16, min move: 1 CMPQ R8, $0x10 JBE emit_lit_memmove_match_emit_encodeBlockAsm2MB_memmove_move_8through16 CMPQ R8, $0x20 JBE emit_lit_memmove_match_emit_encodeBlockAsm2MB_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeBlockAsm2MB_memmove_move_33through64 emit_lit_memmove_match_emit_encodeBlockAsm2MB_memmove_move_8through16: MOVOU (DI), X0 MOVOU X0, (CX) JMP memmove_end_copy_match_emit_encodeBlockAsm2MB emit_lit_memmove_match_emit_encodeBlockAsm2MB_memmove_move_17through32: MOVOU (DI), X0 MOVOU -16(DI)(R8*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(R8*1) JMP memmove_end_copy_match_emit_encodeBlockAsm2MB emit_lit_memmove_match_emit_encodeBlockAsm2MB_memmove_move_33through64: MOVOU (DI), X0 MOVOU 16(DI), X1 MOVOU -32(DI)(R8*1), X2 MOVOU -16(DI)(R8*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R8*1) MOVOU X3, -16(CX)(R8*1) memmove_end_copy_match_emit_encodeBlockAsm2MB: MOVQ R9, CX JMP 
match_nolits_copy_encodeBlockAsm2MB memmove_midmatch_emit_encodeBlockAsm2MB: LEAQ (CX)(R8*1), R9 // genMemMoveShort // margin: 15, min move: 30 CMPQ R8, $0x20 JBE emit_lit_memmove_mid_match_emit_encodeBlockAsm2MB_memmove_move_17through32 JMP emit_lit_memmove_mid_match_emit_encodeBlockAsm2MB_memmove_move_33through64 emit_lit_memmove_mid_match_emit_encodeBlockAsm2MB_memmove_move_17through32: MOVOU (DI), X0 MOVOU -16(DI)(R8*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(R8*1) JMP memmove_mid_end_copy_match_emit_encodeBlockAsm2MB emit_lit_memmove_mid_match_emit_encodeBlockAsm2MB_memmove_move_33through64: MOVOU (DI), X0 MOVOU 16(DI), X1 MOVOU -32(DI)(R8*1), X2 MOVOU -16(DI)(R8*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R8*1) MOVOU X3, -16(CX)(R8*1) memmove_mid_end_copy_match_emit_encodeBlockAsm2MB: MOVQ R9, CX JMP match_nolits_copy_encodeBlockAsm2MB memmove_long_match_emit_encodeBlockAsm2MB: LEAQ (CX)(R8*1), R9 // genMemMoveLong MOVOU (DI), X0 MOVOU 16(DI), X1 MOVOU -32(DI)(R8*1), X2 MOVOU -16(DI)(R8*1), X3 MOVQ R8, R12 SHRQ $0x05, R12 MOVQ CX, R10 ANDL $0x0000001f, R10 MOVQ $0x00000040, R13 SUBQ R10, R13 DECQ R12 JA emit_lit_memmove_long_match_emit_encodeBlockAsm2MBlarge_forward_sse_loop_32 LEAQ -32(DI)(R13*1), R10 LEAQ -32(CX)(R13*1), R14 emit_lit_memmove_long_match_emit_encodeBlockAsm2MBlarge_big_loop_back: MOVOU (R10), X4 MOVOU 16(R10), X5 MOVOA X4, (R14) MOVOA X5, 16(R14) ADDQ $0x20, R14 ADDQ $0x20, R10 ADDQ $0x20, R13 DECQ R12 JNA emit_lit_memmove_long_match_emit_encodeBlockAsm2MBlarge_big_loop_back emit_lit_memmove_long_match_emit_encodeBlockAsm2MBlarge_forward_sse_loop_32: MOVOU -32(DI)(R13*1), X4 MOVOU -16(DI)(R13*1), X5 MOVOA X4, -32(CX)(R13*1) MOVOA X5, -16(CX)(R13*1) ADDQ $0x20, R13 CMPQ R8, R13 JAE emit_lit_memmove_long_match_emit_encodeBlockAsm2MBlarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R8*1) MOVOU X3, -16(CX)(R8*1) MOVQ R9, CX match_nolits_copy_encodeBlockAsm2MB: // emitCopy CMPL SI, $0x0001003f JBE 
two_byte_offset_match_nolit_encodeBlockAsm2MB // emitCopy3 LEAL -4(R11), R11 LEAL -65536(SI), SI SHLL $0x0b, SI ADDL $0x07, SI CMPL R11, $0x3c JBE emit_copy3_0_match_nolit_encodeBlockAsm2MB_emit3 LEAL -60(R11), DI CMPL R11, $0x0000013c JB emit_copy3_1_match_nolit_encodeBlockAsm2MB_emit3 CMPL R11, $0x0001003c JB emit_copy3_2_match_nolit_encodeBlockAsm2MB_emit3 ADDL $0x000007e0, SI MOVL SI, (CX) MOVL DI, 4(CX) ADDQ $0x07, CX JMP match_nolit_emitcopy_end_encodeBlockAsm2MB emit_copy3_2_match_nolit_encodeBlockAsm2MB_emit3: ADDL $0x000007c0, SI MOVL SI, (CX) MOVW DI, 4(CX) ADDQ $0x06, CX JMP match_nolit_emitcopy_end_encodeBlockAsm2MB emit_copy3_1_match_nolit_encodeBlockAsm2MB_emit3: ADDL $0x000007a0, SI MOVL SI, (CX) MOVB DI, 4(CX) ADDQ $0x05, CX JMP match_nolit_emitcopy_end_encodeBlockAsm2MB emit_copy3_0_match_nolit_encodeBlockAsm2MB_emit3: SHLL $0x05, R11 ORL R11, SI MOVL SI, (CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBlockAsm2MB two_byte_offset_match_nolit_encodeBlockAsm2MB: CMPL SI, $0x00000400 JA two_byte_match_nolit_encodeBlockAsm2MB CMPL R11, $0x00000013 JAE emit_one_longer_match_nolit_encodeBlockAsm2MB LEAL -1(SI), SI SHLL $0x06, SI LEAL -15(SI)(R11*4), SI MOVW SI, (CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBlockAsm2MB emit_one_longer_match_nolit_encodeBlockAsm2MB: CMPL R11, $0x00000112 JAE emit_copy1_repeat_match_nolit_encodeBlockAsm2MB LEAL -1(SI), SI SHLL $0x06, SI LEAL 61(SI), SI MOVW SI, (CX) LEAL -18(R11), SI MOVB SI, 2(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBlockAsm2MB emit_copy1_repeat_match_nolit_encodeBlockAsm2MB: LEAL -1(SI), SI SHLL $0x06, SI LEAL 57(SI), SI MOVW SI, (CX) ADDQ $0x02, CX SUBL $0x12, R11 // emitRepeat LEAL -1(R11), SI CMPL R11, $0x1d JBE repeat_one_emit_copy1_do_repeat_match_nolit_encodeBlockAsm2MB LEAL -30(R11), SI CMPL R11, $0x0000011e JB repeat_two_emit_copy1_do_repeat_match_nolit_encodeBlockAsm2MB CMPL R11, $0x0001001e JB repeat_three_emit_copy1_do_repeat_match_nolit_encodeBlockAsm2MB MOVB 
$0xfc, (CX) MOVL SI, 1(CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBlockAsm2MB repeat_three_emit_copy1_do_repeat_match_nolit_encodeBlockAsm2MB: MOVB $0xf4, (CX) MOVW SI, 1(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBlockAsm2MB repeat_two_emit_copy1_do_repeat_match_nolit_encodeBlockAsm2MB: MOVB $0xec, (CX) MOVB SI, 1(CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBlockAsm2MB repeat_one_emit_copy1_do_repeat_match_nolit_encodeBlockAsm2MB: XORL SI, SI LEAL -4(SI)(R11*8), SI MOVB SI, (CX) ADDQ $0x01, CX JMP match_nolit_emitcopy_end_encodeBlockAsm2MB two_byte_match_nolit_encodeBlockAsm2MB: // emitCopy2 LEAL -64(SI), SI LEAL -4(R11), R11 MOVW SI, 1(CX) CMPL R11, $0x3c JBE emit_copy2_0_match_nolit_encodeBlockAsm2MB_emit2 LEAL -60(R11), SI CMPL R11, $0x0000013c JB emit_copy2_1_match_nolit_encodeBlockAsm2MB_emit2 CMPL R11, $0x0001003c JB emit_copy2_2_match_nolit_encodeBlockAsm2MB_emit2 MOVB $0xfe, (CX) MOVL SI, 3(CX) ADDQ $0x06, CX JMP match_nolit_emitcopy_end_encodeBlockAsm2MB emit_copy2_2_match_nolit_encodeBlockAsm2MB_emit2: MOVB $0xfa, (CX) MOVW SI, 3(CX) ADDQ $0x05, CX JMP match_nolit_emitcopy_end_encodeBlockAsm2MB emit_copy2_1_match_nolit_encodeBlockAsm2MB_emit2: MOVB $0xf6, (CX) MOVB SI, 3(CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBlockAsm2MB emit_copy2_0_match_nolit_encodeBlockAsm2MB_emit2: MOVL $0x00000002, SI LEAL (SI)(R11*4), SI MOVB SI, (CX) ADDQ $0x03, CX match_nolit_emitcopy_end_encodeBlockAsm2MB: CMPL DX, 8(SP) JAE emit_remainder_encodeBlockAsm2MB MOVQ -2(BX)(DX*1), DI CMPQ CX, (SP) JB match_nolit_dst_ok_encodeBlockAsm2MB MOVQ $0x00000000, ret+56(FP) RET match_nolit_dst_ok_encodeBlockAsm2MB: MOVQ $0x0000cf1bbcdcbf9b, SI MOVQ DI, R8 SHRQ $0x10, DI MOVQ DI, R9 SHLQ $0x10, R8 IMULQ SI, R8 SHRQ $0x31, R8 SHLQ $0x10, R9 IMULQ SI, R9 SHRQ $0x31, R9 LEAL -2(DX), R10 MOVL (AX)(R9*4), SI MOVL R10, (AX)(R8*4) MOVL DX, (AX)(R9*4) MOVL DX, R8 INCL DX CMPL (BX)(SI*1), DI JNE search_loop_encodeBlockAsm2MB MOVL R8, DI SUBL SI, DI MOVL 
DI, 16(SP) CMPQ CX, (SP) JB dst_size_check_ok_4 MOVQ $0x00000000, ret+56(FP) RET dst_size_check_ok_4: ADDL $0x03, DX ADDL $0x04, SI MOVQ src_len+32(FP), DI SUBL DX, DI LEAQ (BX)(DX*1), R8 LEAQ (BX)(SI*1), SI // matchLen XORL R11, R11 JMP matchlen_loop_16_entry_match_nolit2_encodeBlockAsm2MB matchlen_loopback_16_match_nolit2_encodeBlockAsm2MB: MOVQ (R8)(R11*1), R9 MOVQ 8(R8)(R11*1), R10 XORQ (SI)(R11*1), R9 JNZ matchlen_bsf_8_match_nolit2_encodeBlockAsm2MB XORQ 8(SI)(R11*1), R10 JNZ matchlen_bsf_16match_nolit2_encodeBlockAsm2MB LEAL -16(DI), DI LEAL 16(R11), R11 matchlen_loop_16_entry_match_nolit2_encodeBlockAsm2MB: CMPL DI, $0x10 JAE matchlen_loopback_16_match_nolit2_encodeBlockAsm2MB JMP matchlen_match8_match_nolit2_encodeBlockAsm2MB matchlen_bsf_16match_nolit2_encodeBlockAsm2MB: #ifdef GOAMD64_v3 TZCNTQ R10, R10 #else BSFQ R10, R10 #endif SARQ $0x03, R10 LEAL 8(R11)(R10*1), R11 JMP match_nolit2_end_encodeBlockAsm2MB matchlen_match8_match_nolit2_encodeBlockAsm2MB: CMPL DI, $0x08 JB matchlen_match4_match_nolit2_encodeBlockAsm2MB MOVQ (R8)(R11*1), R9 XORQ (SI)(R11*1), R9 JNZ matchlen_bsf_8_match_nolit2_encodeBlockAsm2MB LEAL -8(DI), DI LEAL 8(R11), R11 JMP matchlen_match4_match_nolit2_encodeBlockAsm2MB matchlen_bsf_8_match_nolit2_encodeBlockAsm2MB: #ifdef GOAMD64_v3 TZCNTQ R9, R9 #else BSFQ R9, R9 #endif SARQ $0x03, R9 LEAL (R11)(R9*1), R11 JMP match_nolit2_end_encodeBlockAsm2MB matchlen_match4_match_nolit2_encodeBlockAsm2MB: CMPL DI, $0x04 JB matchlen_match2_match_nolit2_encodeBlockAsm2MB MOVL (R8)(R11*1), R9 CMPL (SI)(R11*1), R9 JNE matchlen_match2_match_nolit2_encodeBlockAsm2MB LEAL -4(DI), DI LEAL 4(R11), R11 matchlen_match2_match_nolit2_encodeBlockAsm2MB: CMPL DI, $0x01 JE matchlen_match1_match_nolit2_encodeBlockAsm2MB JB match_nolit2_end_encodeBlockAsm2MB MOVW (R8)(R11*1), R9 CMPW (SI)(R11*1), R9 JNE matchlen_match1_match_nolit2_encodeBlockAsm2MB LEAL 2(R11), R11 SUBL $0x02, DI JZ match_nolit2_end_encodeBlockAsm2MB 
matchlen_match1_match_nolit2_encodeBlockAsm2MB: MOVB (R8)(R11*1), R9 CMPB (SI)(R11*1), R9 JNE match_nolit2_end_encodeBlockAsm2MB LEAL 1(R11), R11 match_nolit2_end_encodeBlockAsm2MB: ADDL R11, DX ADDL $0x04, R11 MOVL DX, 12(SP) MOVL 16(SP), SI JMP match_nolits_copy_encodeBlockAsm2MB emit_remainder_encodeBlockAsm2MB: MOVQ src_len+32(FP), AX MOVL 12(SP), DX SUBL DX, AX JZ emit_remainder_end_encodeBlockAsm2MB LEAQ (BX)(DX*1), DX LEAQ 4(CX)(AX*1), BX CMPQ BX, (SP) JB dst_size_check_ok_5 MOVQ $0x00000000, ret+56(FP) RET dst_size_check_ok_5: // emitLiteral LEAL -1(AX), BX CMPL BX, $0x1d JB one_byte_emit_remainder_encodeBlockAsm2MB SUBL $0x1d, BX CMPL BX, $0x00000100 JB two_bytes_emit_remainder_encodeBlockAsm2MB CMPL BX, $0x00010000 JB three_bytes_emit_remainder_encodeBlockAsm2MB MOVL BX, SI SHRL $0x10, SI MOVB $0xf8, (CX) MOVW BX, 1(CX) MOVB SI, 3(CX) ADDQ $0x04, CX ADDL $0x1d, BX JMP memmove_long_emit_remainder_encodeBlockAsm2MB three_bytes_emit_remainder_encodeBlockAsm2MB: MOVB $0xf0, (CX) MOVW BX, 1(CX) ADDQ $0x03, CX ADDL $0x1d, BX JMP memmove_long_emit_remainder_encodeBlockAsm2MB two_bytes_emit_remainder_encodeBlockAsm2MB: MOVB $0xe8, (CX) MOVB BL, 1(CX) ADDL $0x1d, BX ADDQ $0x02, CX CMPL BX, $0x40 JB memmove_midemit_remainder_encodeBlockAsm2MB JMP memmove_long_emit_remainder_encodeBlockAsm2MB one_byte_emit_remainder_encodeBlockAsm2MB: SHLB $0x03, BL MOVB BL, (CX) ADDQ $0x01, CX LEAQ (CX)(AX*1), BX // genMemMoveShort // margin: 0, min move: 1 CMPQ AX, $0x03 JB emit_lit_memmove_emit_remainder_encodeBlockAsm2MB_memmove_move_1or2 JE emit_lit_memmove_emit_remainder_encodeBlockAsm2MB_memmove_move_3 CMPQ AX, $0x08 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm2MB_memmove_move_4through8 CMPQ AX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm2MB_memmove_move_8through16 CMPQ AX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm2MB_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeBlockAsm2MB_memmove_move_33through64 
emit_lit_memmove_emit_remainder_encodeBlockAsm2MB_memmove_move_1or2: MOVB (DX), SI MOVB -1(DX)(AX*1), DL MOVB SI, (CX) MOVB DL, -1(CX)(AX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm2MB emit_lit_memmove_emit_remainder_encodeBlockAsm2MB_memmove_move_3: MOVW (DX), SI MOVB 2(DX), DL MOVW SI, (CX) MOVB DL, 2(CX) JMP memmove_end_copy_emit_remainder_encodeBlockAsm2MB emit_lit_memmove_emit_remainder_encodeBlockAsm2MB_memmove_move_4through8: MOVL (DX), SI MOVL -4(DX)(AX*1), DX MOVL SI, (CX) MOVL DX, -4(CX)(AX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm2MB emit_lit_memmove_emit_remainder_encodeBlockAsm2MB_memmove_move_8through16: MOVQ (DX), SI MOVQ -8(DX)(AX*1), DX MOVQ SI, (CX) MOVQ DX, -8(CX)(AX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm2MB emit_lit_memmove_emit_remainder_encodeBlockAsm2MB_memmove_move_17through32: MOVOU (DX), X0 MOVOU -16(DX)(AX*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(AX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm2MB emit_lit_memmove_emit_remainder_encodeBlockAsm2MB_memmove_move_33through64: MOVOU (DX), X0 MOVOU 16(DX), X1 MOVOU -32(DX)(AX*1), X2 MOVOU -16(DX)(AX*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(AX*1) MOVOU X3, -16(CX)(AX*1) memmove_end_copy_emit_remainder_encodeBlockAsm2MB: MOVQ BX, CX JMP emit_remainder_end_encodeBlockAsm2MB memmove_midemit_remainder_encodeBlockAsm2MB: LEAQ (CX)(AX*1), BX // genMemMoveShort // margin: 0, min move: 30 CMPQ AX, $0x20 JBE emit_lit_memmove_mid_emit_remainder_encodeBlockAsm2MB_memmove_move_17through32 JMP emit_lit_memmove_mid_emit_remainder_encodeBlockAsm2MB_memmove_move_33through64 emit_lit_memmove_mid_emit_remainder_encodeBlockAsm2MB_memmove_move_17through32: MOVOU (DX), X0 MOVOU -16(DX)(AX*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(AX*1) JMP memmove_mid_end_copy_emit_remainder_encodeBlockAsm2MB emit_lit_memmove_mid_emit_remainder_encodeBlockAsm2MB_memmove_move_33through64: MOVOU (DX), X0 MOVOU 16(DX), X1 MOVOU -32(DX)(AX*1), X2 MOVOU -16(DX)(AX*1), X3 MOVOU X0, 
(CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(AX*1) MOVOU X3, -16(CX)(AX*1) memmove_mid_end_copy_emit_remainder_encodeBlockAsm2MB: MOVQ BX, CX JMP emit_remainder_end_encodeBlockAsm2MB memmove_long_emit_remainder_encodeBlockAsm2MB: LEAQ (CX)(AX*1), BX // genMemMoveLong MOVOU (DX), X0 MOVOU 16(DX), X1 MOVOU -32(DX)(AX*1), X2 MOVOU -16(DX)(AX*1), X3 MOVQ AX, DI SHRQ $0x05, DI MOVQ CX, SI ANDL $0x0000001f, SI MOVQ $0x00000040, R8 SUBQ SI, R8 DECQ DI JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm2MBlarge_forward_sse_loop_32 LEAQ -32(DX)(R8*1), SI LEAQ -32(CX)(R8*1), R9 emit_lit_memmove_long_emit_remainder_encodeBlockAsm2MBlarge_big_loop_back: MOVOU (SI), X4 MOVOU 16(SI), X5 MOVOA X4, (R9) MOVOA X5, 16(R9) ADDQ $0x20, R9 ADDQ $0x20, SI ADDQ $0x20, R8 DECQ DI JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm2MBlarge_big_loop_back emit_lit_memmove_long_emit_remainder_encodeBlockAsm2MBlarge_forward_sse_loop_32: MOVOU -32(DX)(R8*1), X4 MOVOU -16(DX)(R8*1), X5 MOVOA X4, -32(CX)(R8*1) MOVOA X5, -16(CX)(R8*1) ADDQ $0x20, R8 CMPQ AX, R8 JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm2MBlarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(AX*1) MOVOU X3, -16(CX)(AX*1) MOVQ BX, CX emit_remainder_end_encodeBlockAsm2MB: MOVQ dst_base+0(FP), AX SUBQ AX, CX MOVQ CX, ret+56(FP) RET // func encodeBlockAsm512K(dst []byte, src []byte, tmp *[65536]byte) int // Requires: BMI, CMOV, SSE2 TEXT ·encodeBlockAsm512K(SB), $24-64 MOVQ tmp+48(FP), AX MOVQ dst_base+0(FP), CX MOVQ $0x00000200, DX MOVQ AX, BX PXOR X0, X0 zero_loop_encodeBlockAsm512K: MOVOU X0, (BX) MOVOU X0, 16(BX) MOVOU X0, 32(BX) MOVOU X0, 48(BX) MOVOU X0, 64(BX) MOVOU X0, 80(BX) MOVOU X0, 96(BX) MOVOU X0, 112(BX) ADDQ $0x80, BX DECQ DX JNZ zero_loop_encodeBlockAsm512K MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), DX LEAQ -17(DX), BX LEAQ -17(DX), SI MOVL SI, 8(SP) SHRQ $0x05, DX SUBL DX, BX LEAQ (CX)(BX*1), BX MOVQ BX, (SP) MOVL $0x00000001, DX MOVL DX, 16(SP) MOVQ src_base+24(FP), BX 
search_loop_encodeBlockAsm512K: MOVL DX, SI SUBL 12(SP), SI SHRL $0x06, SI LEAL 4(DX)(SI*1), SI CMPL SI, 8(SP) JAE emit_remainder_encodeBlockAsm512K MOVQ (BX)(DX*1), DI MOVL SI, 20(SP) MOVQ $0x0000cf1bbcdcbf9b, R9 MOVQ DI, R10 MOVQ DI, R11 SHRQ $0x08, R11 SHLQ $0x10, R10 IMULQ R9, R10 SHRQ $0x32, R10 SHLQ $0x10, R11 IMULQ R9, R11 SHRQ $0x32, R11 MOVL (AX)(R10*4), SI MOVL (AX)(R11*4), R8 MOVL DX, (AX)(R10*4) MOVL DX, (AX)(R11*4) MOVQ DI, R10 SHRQ $0x10, R10 SHLQ $0x10, R10 IMULQ R9, R10 SHRQ $0x32, R10 MOVL DX, R9 SUBL 16(SP), R9 MOVL 1(BX)(R9*1), R11 MOVQ DI, R9 SHRQ $0x08, R9 CMPL R9, R11 JNE no_repeat_found_encodeBlockAsm512K LEAL 1(DX), DI MOVL 12(SP), SI MOVL DI, R8 SUBL 16(SP), R8 JZ repeat_extend_back_end_encodeBlockAsm512K repeat_extend_back_loop_encodeBlockAsm512K: CMPL DI, SI JBE repeat_extend_back_end_encodeBlockAsm512K MOVB -1(BX)(R8*1), R9 MOVB -1(BX)(DI*1), R10 CMPB R9, R10 JNE repeat_extend_back_end_encodeBlockAsm512K LEAL -1(DI), DI DECL R8 JNZ repeat_extend_back_loop_encodeBlockAsm512K repeat_extend_back_end_encodeBlockAsm512K: MOVL DI, SI MOVL 12(SP), R8 SUBL R8, SI LEAQ 4(CX)(SI*1), R9 CMPQ R9, (SP) JB dst_size_check_ok_1 MOVQ $0x00000000, ret+56(FP) RET dst_size_check_ok_1: LEAQ (BX)(R8*1), R8 // emitLiteral LEAL -1(SI), R9 CMPL R9, $0x1d JB one_byte_repeat_emit_lits_encodeBlockAsm512K SUBL $0x1d, R9 CMPL R9, $0x00000100 JB two_bytes_repeat_emit_lits_encodeBlockAsm512K CMPL R9, $0x00010000 JB three_bytes_repeat_emit_lits_encodeBlockAsm512K MOVL R9, R10 SHRL $0x10, R10 MOVB $0xf8, (CX) MOVW R9, 1(CX) MOVB R10, 3(CX) ADDQ $0x04, CX ADDL $0x1d, R9 JMP memmove_long_repeat_emit_lits_encodeBlockAsm512K three_bytes_repeat_emit_lits_encodeBlockAsm512K: MOVB $0xf0, (CX) MOVW R9, 1(CX) ADDQ $0x03, CX ADDL $0x1d, R9 JMP memmove_long_repeat_emit_lits_encodeBlockAsm512K two_bytes_repeat_emit_lits_encodeBlockAsm512K: MOVB $0xe8, (CX) MOVB R9, 1(CX) ADDL $0x1d, R9 ADDQ $0x02, CX CMPL R9, $0x40 JB memmove_midrepeat_emit_lits_encodeBlockAsm512K JMP 
memmove_long_repeat_emit_lits_encodeBlockAsm512K one_byte_repeat_emit_lits_encodeBlockAsm512K: SHLB $0x03, R9 MOVB R9, (CX) ADDQ $0x01, CX LEAQ (CX)(SI*1), R9 // genMemMoveShort // margin: 16, min move: 1 CMPQ SI, $0x10 JBE emit_lit_memmove_repeat_emit_lits_encodeBlockAsm512K_memmove_move_8through16 CMPQ SI, $0x20 JBE emit_lit_memmove_repeat_emit_lits_encodeBlockAsm512K_memmove_move_17through32 JMP emit_lit_memmove_repeat_emit_lits_encodeBlockAsm512K_memmove_move_33through64 emit_lit_memmove_repeat_emit_lits_encodeBlockAsm512K_memmove_move_8through16: MOVOU (R8), X0 MOVOU X0, (CX) JMP memmove_end_copy_repeat_emit_lits_encodeBlockAsm512K emit_lit_memmove_repeat_emit_lits_encodeBlockAsm512K_memmove_move_17through32: MOVOU (R8), X0 MOVOU -16(R8)(SI*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(SI*1) JMP memmove_end_copy_repeat_emit_lits_encodeBlockAsm512K emit_lit_memmove_repeat_emit_lits_encodeBlockAsm512K_memmove_move_33through64: MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(SI*1), X2 MOVOU -16(R8)(SI*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(SI*1) MOVOU X3, -16(CX)(SI*1) memmove_end_copy_repeat_emit_lits_encodeBlockAsm512K: MOVQ R9, CX JMP repeat_emit_lits_end_encodeBlockAsm512K memmove_midrepeat_emit_lits_encodeBlockAsm512K: LEAQ (CX)(SI*1), R9 // genMemMoveShort // margin: 15, min move: 30 CMPQ SI, $0x20 JBE emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm512K_memmove_move_17through32 JMP emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm512K_memmove_move_33through64 emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm512K_memmove_move_17through32: MOVOU (R8), X0 MOVOU -16(R8)(SI*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(SI*1) JMP memmove_mid_end_copy_repeat_emit_lits_encodeBlockAsm512K emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm512K_memmove_move_33through64: MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(SI*1), X2 MOVOU -16(R8)(SI*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(SI*1) MOVOU X3, -16(CX)(SI*1) 
memmove_mid_end_copy_repeat_emit_lits_encodeBlockAsm512K: MOVQ R9, CX JMP repeat_emit_lits_end_encodeBlockAsm512K memmove_long_repeat_emit_lits_encodeBlockAsm512K: LEAQ (CX)(SI*1), R9 // genMemMoveLong MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(SI*1), X2 MOVOU -16(R8)(SI*1), X3 MOVQ SI, R11 SHRQ $0x05, R11 MOVQ CX, R10 ANDL $0x0000001f, R10 MOVQ $0x00000040, R12 SUBQ R10, R12 DECQ R11 JA emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm512Klarge_forward_sse_loop_32 LEAQ -32(R8)(R12*1), R10 LEAQ -32(CX)(R12*1), R13 emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm512Klarge_big_loop_back: MOVOU (R10), X4 MOVOU 16(R10), X5 MOVOA X4, (R13) MOVOA X5, 16(R13) ADDQ $0x20, R13 ADDQ $0x20, R10 ADDQ $0x20, R12 DECQ R11 JNA emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm512Klarge_big_loop_back emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm512Klarge_forward_sse_loop_32: MOVOU -32(R8)(R12*1), X4 MOVOU -16(R8)(R12*1), X5 MOVOA X4, -32(CX)(R12*1) MOVOA X5, -16(CX)(R12*1) ADDQ $0x20, R12 CMPQ SI, R12 JAE emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm512Klarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(SI*1) MOVOU X3, -16(CX)(SI*1) MOVQ R9, CX repeat_emit_lits_end_encodeBlockAsm512K: ADDL $0x05, DX MOVL DX, SI SUBL 16(SP), SI MOVQ src_len+32(FP), R8 SUBL DX, R8 LEAQ (BX)(DX*1), R9 LEAQ (BX)(SI*1), SI // matchLen XORL R11, R11 JMP matchlen_loop_16_entry_repeat_extend_encodeBlockAsm512K matchlen_loopback_16_repeat_extend_encodeBlockAsm512K: MOVQ (R9)(R11*1), R10 MOVQ 8(R9)(R11*1), R12 XORQ (SI)(R11*1), R10 JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm512K XORQ 8(SI)(R11*1), R12 JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm512K LEAL -16(R8), R8 LEAL 16(R11), R11 matchlen_loop_16_entry_repeat_extend_encodeBlockAsm512K: CMPL R8, $0x10 JAE matchlen_loopback_16_repeat_extend_encodeBlockAsm512K JMP matchlen_match8_repeat_extend_encodeBlockAsm512K matchlen_bsf_16repeat_extend_encodeBlockAsm512K: #ifdef GOAMD64_v3 TZCNTQ R12, R12 
#else BSFQ R12, R12 #endif SARQ $0x03, R12 LEAL 8(R11)(R12*1), R11 JMP repeat_extend_forward_end_encodeBlockAsm512K matchlen_match8_repeat_extend_encodeBlockAsm512K: CMPL R8, $0x08 JB matchlen_match4_repeat_extend_encodeBlockAsm512K MOVQ (R9)(R11*1), R10 XORQ (SI)(R11*1), R10 JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm512K LEAL -8(R8), R8 LEAL 8(R11), R11 JMP matchlen_match4_repeat_extend_encodeBlockAsm512K matchlen_bsf_8_repeat_extend_encodeBlockAsm512K: #ifdef GOAMD64_v3 TZCNTQ R10, R10 #else BSFQ R10, R10 #endif SARQ $0x03, R10 LEAL (R11)(R10*1), R11 JMP repeat_extend_forward_end_encodeBlockAsm512K matchlen_match4_repeat_extend_encodeBlockAsm512K: CMPL R8, $0x04 JB matchlen_match2_repeat_extend_encodeBlockAsm512K MOVL (R9)(R11*1), R10 CMPL (SI)(R11*1), R10 JNE matchlen_match2_repeat_extend_encodeBlockAsm512K LEAL -4(R8), R8 LEAL 4(R11), R11 matchlen_match2_repeat_extend_encodeBlockAsm512K: CMPL R8, $0x01 JE matchlen_match1_repeat_extend_encodeBlockAsm512K JB repeat_extend_forward_end_encodeBlockAsm512K MOVW (R9)(R11*1), R10 CMPW (SI)(R11*1), R10 JNE matchlen_match1_repeat_extend_encodeBlockAsm512K LEAL 2(R11), R11 SUBL $0x02, R8 JZ repeat_extend_forward_end_encodeBlockAsm512K matchlen_match1_repeat_extend_encodeBlockAsm512K: MOVB (R9)(R11*1), R10 CMPB (SI)(R11*1), R10 JNE repeat_extend_forward_end_encodeBlockAsm512K LEAL 1(R11), R11 repeat_extend_forward_end_encodeBlockAsm512K: ADDL R11, DX MOVL DX, SI SUBL DI, SI MOVL 16(SP), DI // emitRepeat LEAL -1(SI), DI CMPL SI, $0x1d JBE repeat_one_match_repeat_encodeBlockAsm512K LEAL -30(SI), DI CMPL SI, $0x0000011e JB repeat_two_match_repeat_encodeBlockAsm512K CMPL SI, $0x0001001e JB repeat_three_match_repeat_encodeBlockAsm512K MOVB $0xfc, (CX) MOVL DI, 1(CX) ADDQ $0x04, CX JMP repeat_end_emit_encodeBlockAsm512K repeat_three_match_repeat_encodeBlockAsm512K: MOVB $0xf4, (CX) MOVW DI, 1(CX) ADDQ $0x03, CX JMP repeat_end_emit_encodeBlockAsm512K repeat_two_match_repeat_encodeBlockAsm512K: MOVB $0xec, (CX) MOVB DI, 1(CX) 
ADDQ $0x02, CX JMP repeat_end_emit_encodeBlockAsm512K repeat_one_match_repeat_encodeBlockAsm512K: XORL DI, DI LEAL -4(DI)(SI*8), DI MOVB DI, (CX) ADDQ $0x01, CX repeat_end_emit_encodeBlockAsm512K: MOVL DX, 12(SP) JMP search_loop_encodeBlockAsm512K no_repeat_found_encodeBlockAsm512K: CMPL (BX)(SI*1), DI JEQ candidate_match_encodeBlockAsm512K SHRQ $0x08, DI MOVL (AX)(R10*4), SI LEAL 2(DX), R9 CMPL (BX)(R8*1), DI JEQ candidate2_match_encodeBlockAsm512K MOVL R9, (AX)(R10*4) SHRQ $0x08, DI CMPL (BX)(SI*1), DI JEQ candidate3_match_encodeBlockAsm512K MOVL 20(SP), DX JMP search_loop_encodeBlockAsm512K candidate3_match_encodeBlockAsm512K: ADDL $0x02, DX JMP candidate_match_encodeBlockAsm512K candidate2_match_encodeBlockAsm512K: MOVL R9, (AX)(R10*4) INCL DX MOVL R8, SI candidate_match_encodeBlockAsm512K: MOVL 12(SP), DI TESTL SI, SI JZ match_extend_back_end_encodeBlockAsm512K match_extend_back_loop_encodeBlockAsm512K: CMPL DX, DI JBE match_extend_back_end_encodeBlockAsm512K MOVB -1(BX)(SI*1), R8 MOVB -1(BX)(DX*1), R9 CMPB R8, R9 JNE match_extend_back_end_encodeBlockAsm512K LEAL -1(DX), DX DECL SI JZ match_extend_back_end_encodeBlockAsm512K JMP match_extend_back_loop_encodeBlockAsm512K match_extend_back_end_encodeBlockAsm512K: CMPQ CX, (SP) JB dst_size_check_ok_2 MOVQ $0x00000000, ret+56(FP) RET dst_size_check_ok_2: MOVL DX, R8 MOVL DX, DI SUBL SI, DI MOVL DI, 16(SP) ADDL $0x04, DX ADDL $0x04, SI MOVQ src_len+32(FP), DI SUBL DX, DI LEAQ (BX)(DX*1), R9 LEAQ (BX)(SI*1), SI // matchLen XORL R11, R11 JMP matchlen_loop_16_entry_match_nolit_encodeBlockAsm512K matchlen_loopback_16_match_nolit_encodeBlockAsm512K: MOVQ (R9)(R11*1), R10 MOVQ 8(R9)(R11*1), R12 XORQ (SI)(R11*1), R10 JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm512K XORQ 8(SI)(R11*1), R12 JNZ matchlen_bsf_16match_nolit_encodeBlockAsm512K LEAL -16(DI), DI LEAL 16(R11), R11 matchlen_loop_16_entry_match_nolit_encodeBlockAsm512K: CMPL DI, $0x10 JAE matchlen_loopback_16_match_nolit_encodeBlockAsm512K JMP 
matchlen_match8_match_nolit_encodeBlockAsm512K matchlen_bsf_16match_nolit_encodeBlockAsm512K: #ifdef GOAMD64_v3 TZCNTQ R12, R12 #else BSFQ R12, R12 #endif SARQ $0x03, R12 LEAL 8(R11)(R12*1), R11 JMP match_nolit_end_encodeBlockAsm512K matchlen_match8_match_nolit_encodeBlockAsm512K: CMPL DI, $0x08 JB matchlen_match4_match_nolit_encodeBlockAsm512K MOVQ (R9)(R11*1), R10 XORQ (SI)(R11*1), R10 JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm512K LEAL -8(DI), DI LEAL 8(R11), R11 JMP matchlen_match4_match_nolit_encodeBlockAsm512K matchlen_bsf_8_match_nolit_encodeBlockAsm512K: #ifdef GOAMD64_v3 TZCNTQ R10, R10 #else BSFQ R10, R10 #endif SARQ $0x03, R10 LEAL (R11)(R10*1), R11 JMP match_nolit_end_encodeBlockAsm512K matchlen_match4_match_nolit_encodeBlockAsm512K: CMPL DI, $0x04 JB matchlen_match2_match_nolit_encodeBlockAsm512K MOVL (R9)(R11*1), R10 CMPL (SI)(R11*1), R10 JNE matchlen_match2_match_nolit_encodeBlockAsm512K LEAL -4(DI), DI LEAL 4(R11), R11 matchlen_match2_match_nolit_encodeBlockAsm512K: CMPL DI, $0x01 JE matchlen_match1_match_nolit_encodeBlockAsm512K JB match_nolit_end_encodeBlockAsm512K MOVW (R9)(R11*1), R10 CMPW (SI)(R11*1), R10 JNE matchlen_match1_match_nolit_encodeBlockAsm512K LEAL 2(R11), R11 SUBL $0x02, DI JZ match_nolit_end_encodeBlockAsm512K matchlen_match1_match_nolit_encodeBlockAsm512K: MOVB (R9)(R11*1), R10 CMPB (SI)(R11*1), R10 JNE match_nolit_end_encodeBlockAsm512K LEAL 1(R11), R11 match_nolit_end_encodeBlockAsm512K: ADDL R11, DX ADDL $0x04, R11 MOVL 16(SP), SI MOVL 12(SP), DI MOVL DX, 12(SP) SUBL DI, R8 JZ match_nolits_copy_encodeBlockAsm512K LEAQ (BX)(DI*1), DI CMPL R8, $0x03 JA match_emit_lits_copy_encodeBlockAsm512K CMPL SI, $0x40 JB match_emit_lits_copy_encodeBlockAsm512K MOVL (DI), DI CMPL SI, $0x0001003f JBE match_emit_copy2lits_encodeBlockAsm512K // emitCopy3 LEAL -4(R11), R11 LEAL -65536(SI), SI SHLL $0x0b, SI LEAL 7(SI)(R8*8), SI CMPL R11, $0x3c JBE emit_copy3_0_match_emit_lits_encodeBlockAsm512K LEAL -60(R11), R9 CMPL R11, $0x0000013c JB 
emit_copy3_1_match_emit_lits_encodeBlockAsm512K CMPL R11, $0x0001003c JB emit_copy3_2_match_emit_lits_encodeBlockAsm512K ADDL $0x000007e0, SI MOVL SI, (CX) MOVL R9, 4(CX) ADDQ $0x07, CX JMP match_emit_copy_litsencodeBlockAsm512K emit_copy3_2_match_emit_lits_encodeBlockAsm512K: ADDL $0x000007c0, SI MOVL SI, (CX) MOVW R9, 4(CX) ADDQ $0x06, CX JMP match_emit_copy_litsencodeBlockAsm512K emit_copy3_1_match_emit_lits_encodeBlockAsm512K: ADDL $0x000007a0, SI MOVL SI, (CX) MOVB R9, 4(CX) ADDQ $0x05, CX JMP match_emit_copy_litsencodeBlockAsm512K emit_copy3_0_match_emit_lits_encodeBlockAsm512K: SHLL $0x05, R11 ORL R11, SI MOVL SI, (CX) ADDQ $0x04, CX match_emit_copy_litsencodeBlockAsm512K: MOVL DI, (CX) ADDQ R8, CX JMP match_nolit_emitcopy_end_encodeBlockAsm512K match_emit_copy2lits_encodeBlockAsm512K: // emitCopy2WithLits XORQ R9, R9 SUBL $0x40, SI LEAL -11(R11), R10 LEAL -4(R11), R11 MOVW SI, 1(CX) CMPL R11, $0x07 CMOVLGE R10, R9 MOVQ $0x00000007, SI CMOVLLT R11, SI LEAL -1(R8)(SI*4), SI MOVL $0x00000003, R10 LEAL (R10)(SI*8), SI MOVB SI, (CX) ADDQ $0x03, CX MOVL DI, (CX) ADDQ R8, CX TESTL R9, R9 JZ match_nolit_emitcopy_end_encodeBlockAsm512K // emitRepeat LEAL -1(R9), SI CMPL R9, $0x1d JBE repeat_one_match_emit_repeat_copy2_encodeBlockAsm512K LEAL -30(R9), SI CMPL R9, $0x0000011e JB repeat_two_match_emit_repeat_copy2_encodeBlockAsm512K CMPL R9, $0x0001001e JB repeat_three_match_emit_repeat_copy2_encodeBlockAsm512K MOVB $0xfc, (CX) MOVL SI, 1(CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBlockAsm512K repeat_three_match_emit_repeat_copy2_encodeBlockAsm512K: MOVB $0xf4, (CX) MOVW SI, 1(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBlockAsm512K repeat_two_match_emit_repeat_copy2_encodeBlockAsm512K: MOVB $0xec, (CX) MOVB SI, 1(CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBlockAsm512K repeat_one_match_emit_repeat_copy2_encodeBlockAsm512K: XORL SI, SI LEAL -4(SI)(R9*8), SI MOVB SI, (CX) ADDQ $0x01, CX JMP match_nolit_emitcopy_end_encodeBlockAsm512K 
match_emit_lits_copy_encodeBlockAsm512K: LEAQ 4(CX)(R8*1), R9 CMPQ R9, (SP) JB dst_size_check_ok_3 MOVQ $0x00000000, ret+56(FP) RET dst_size_check_ok_3: // emitLiteral LEAL -1(R8), R9 CMPL R9, $0x1d JB one_byte_match_emit_encodeBlockAsm512K SUBL $0x1d, R9 CMPL R9, $0x00000100 JB two_bytes_match_emit_encodeBlockAsm512K CMPL R9, $0x00010000 JB three_bytes_match_emit_encodeBlockAsm512K MOVL R9, R10 SHRL $0x10, R10 MOVB $0xf8, (CX) MOVW R9, 1(CX) MOVB R10, 3(CX) ADDQ $0x04, CX ADDL $0x1d, R9 JMP memmove_long_match_emit_encodeBlockAsm512K three_bytes_match_emit_encodeBlockAsm512K: MOVB $0xf0, (CX) MOVW R9, 1(CX) ADDQ $0x03, CX ADDL $0x1d, R9 JMP memmove_long_match_emit_encodeBlockAsm512K two_bytes_match_emit_encodeBlockAsm512K: MOVB $0xe8, (CX) MOVB R9, 1(CX) ADDL $0x1d, R9 ADDQ $0x02, CX CMPL R9, $0x40 JB memmove_midmatch_emit_encodeBlockAsm512K JMP memmove_long_match_emit_encodeBlockAsm512K one_byte_match_emit_encodeBlockAsm512K: SHLB $0x03, R9 MOVB R9, (CX) ADDQ $0x01, CX LEAQ (CX)(R8*1), R9 // genMemMoveShort // margin: 16, min move: 1 CMPQ R8, $0x10 JBE emit_lit_memmove_match_emit_encodeBlockAsm512K_memmove_move_8through16 CMPQ R8, $0x20 JBE emit_lit_memmove_match_emit_encodeBlockAsm512K_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeBlockAsm512K_memmove_move_33through64 emit_lit_memmove_match_emit_encodeBlockAsm512K_memmove_move_8through16: MOVOU (DI), X0 MOVOU X0, (CX) JMP memmove_end_copy_match_emit_encodeBlockAsm512K emit_lit_memmove_match_emit_encodeBlockAsm512K_memmove_move_17through32: MOVOU (DI), X0 MOVOU -16(DI)(R8*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(R8*1) JMP memmove_end_copy_match_emit_encodeBlockAsm512K emit_lit_memmove_match_emit_encodeBlockAsm512K_memmove_move_33through64: MOVOU (DI), X0 MOVOU 16(DI), X1 MOVOU -32(DI)(R8*1), X2 MOVOU -16(DI)(R8*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R8*1) MOVOU X3, -16(CX)(R8*1) memmove_end_copy_match_emit_encodeBlockAsm512K: MOVQ R9, CX JMP match_nolits_copy_encodeBlockAsm512K 
memmove_midmatch_emit_encodeBlockAsm512K: LEAQ (CX)(R8*1), R9 // genMemMoveShort // margin: 15, min move: 30 CMPQ R8, $0x20 JBE emit_lit_memmove_mid_match_emit_encodeBlockAsm512K_memmove_move_17through32 JMP emit_lit_memmove_mid_match_emit_encodeBlockAsm512K_memmove_move_33through64 emit_lit_memmove_mid_match_emit_encodeBlockAsm512K_memmove_move_17through32: MOVOU (DI), X0 MOVOU -16(DI)(R8*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(R8*1) JMP memmove_mid_end_copy_match_emit_encodeBlockAsm512K emit_lit_memmove_mid_match_emit_encodeBlockAsm512K_memmove_move_33through64: MOVOU (DI), X0 MOVOU 16(DI), X1 MOVOU -32(DI)(R8*1), X2 MOVOU -16(DI)(R8*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R8*1) MOVOU X3, -16(CX)(R8*1) memmove_mid_end_copy_match_emit_encodeBlockAsm512K: MOVQ R9, CX JMP match_nolits_copy_encodeBlockAsm512K memmove_long_match_emit_encodeBlockAsm512K: LEAQ (CX)(R8*1), R9 // genMemMoveLong MOVOU (DI), X0 MOVOU 16(DI), X1 MOVOU -32(DI)(R8*1), X2 MOVOU -16(DI)(R8*1), X3 MOVQ R8, R12 SHRQ $0x05, R12 MOVQ CX, R10 ANDL $0x0000001f, R10 MOVQ $0x00000040, R13 SUBQ R10, R13 DECQ R12 JA emit_lit_memmove_long_match_emit_encodeBlockAsm512Klarge_forward_sse_loop_32 LEAQ -32(DI)(R13*1), R10 LEAQ -32(CX)(R13*1), R14 emit_lit_memmove_long_match_emit_encodeBlockAsm512Klarge_big_loop_back: MOVOU (R10), X4 MOVOU 16(R10), X5 MOVOA X4, (R14) MOVOA X5, 16(R14) ADDQ $0x20, R14 ADDQ $0x20, R10 ADDQ $0x20, R13 DECQ R12 JNA emit_lit_memmove_long_match_emit_encodeBlockAsm512Klarge_big_loop_back emit_lit_memmove_long_match_emit_encodeBlockAsm512Klarge_forward_sse_loop_32: MOVOU -32(DI)(R13*1), X4 MOVOU -16(DI)(R13*1), X5 MOVOA X4, -32(CX)(R13*1) MOVOA X5, -16(CX)(R13*1) ADDQ $0x20, R13 CMPQ R8, R13 JAE emit_lit_memmove_long_match_emit_encodeBlockAsm512Klarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R8*1) MOVOU X3, -16(CX)(R8*1) MOVQ R9, CX match_nolits_copy_encodeBlockAsm512K: // emitCopy CMPL SI, $0x0001003f JBE 
two_byte_offset_match_nolit_encodeBlockAsm512K // emitCopy3 LEAL -4(R11), R11 LEAL -65536(SI), SI SHLL $0x0b, SI ADDL $0x07, SI CMPL R11, $0x3c JBE emit_copy3_0_match_nolit_encodeBlockAsm512K_emit3 LEAL -60(R11), DI CMPL R11, $0x0000013c JB emit_copy3_1_match_nolit_encodeBlockAsm512K_emit3 CMPL R11, $0x0001003c JB emit_copy3_2_match_nolit_encodeBlockAsm512K_emit3 ADDL $0x000007e0, SI MOVL SI, (CX) MOVL DI, 4(CX) ADDQ $0x07, CX JMP match_nolit_emitcopy_end_encodeBlockAsm512K emit_copy3_2_match_nolit_encodeBlockAsm512K_emit3: ADDL $0x000007c0, SI MOVL SI, (CX) MOVW DI, 4(CX) ADDQ $0x06, CX JMP match_nolit_emitcopy_end_encodeBlockAsm512K emit_copy3_1_match_nolit_encodeBlockAsm512K_emit3: ADDL $0x000007a0, SI MOVL SI, (CX) MOVB DI, 4(CX) ADDQ $0x05, CX JMP match_nolit_emitcopy_end_encodeBlockAsm512K emit_copy3_0_match_nolit_encodeBlockAsm512K_emit3: SHLL $0x05, R11 ORL R11, SI MOVL SI, (CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBlockAsm512K two_byte_offset_match_nolit_encodeBlockAsm512K: CMPL SI, $0x00000400 JA two_byte_match_nolit_encodeBlockAsm512K CMPL R11, $0x00000013 JAE emit_one_longer_match_nolit_encodeBlockAsm512K LEAL -1(SI), SI SHLL $0x06, SI LEAL -15(SI)(R11*4), SI MOVW SI, (CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBlockAsm512K emit_one_longer_match_nolit_encodeBlockAsm512K: CMPL R11, $0x00000112 JAE emit_copy1_repeat_match_nolit_encodeBlockAsm512K LEAL -1(SI), SI SHLL $0x06, SI LEAL 61(SI), SI MOVW SI, (CX) LEAL -18(R11), SI MOVB SI, 2(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBlockAsm512K emit_copy1_repeat_match_nolit_encodeBlockAsm512K: LEAL -1(SI), SI SHLL $0x06, SI LEAL 57(SI), SI MOVW SI, (CX) ADDQ $0x02, CX SUBL $0x12, R11 // emitRepeat LEAL -1(R11), SI CMPL R11, $0x1d JBE repeat_one_emit_copy1_do_repeat_match_nolit_encodeBlockAsm512K LEAL -30(R11), SI CMPL R11, $0x0000011e JB repeat_two_emit_copy1_do_repeat_match_nolit_encodeBlockAsm512K CMPL R11, $0x0001001e JB 
repeat_three_emit_copy1_do_repeat_match_nolit_encodeBlockAsm512K MOVB $0xfc, (CX) MOVL SI, 1(CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBlockAsm512K repeat_three_emit_copy1_do_repeat_match_nolit_encodeBlockAsm512K: MOVB $0xf4, (CX) MOVW SI, 1(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBlockAsm512K repeat_two_emit_copy1_do_repeat_match_nolit_encodeBlockAsm512K: MOVB $0xec, (CX) MOVB SI, 1(CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBlockAsm512K repeat_one_emit_copy1_do_repeat_match_nolit_encodeBlockAsm512K: XORL SI, SI LEAL -4(SI)(R11*8), SI MOVB SI, (CX) ADDQ $0x01, CX JMP match_nolit_emitcopy_end_encodeBlockAsm512K two_byte_match_nolit_encodeBlockAsm512K: // emitCopy2 LEAL -64(SI), SI LEAL -4(R11), R11 MOVW SI, 1(CX) CMPL R11, $0x3c JBE emit_copy2_0_match_nolit_encodeBlockAsm512K_emit2 LEAL -60(R11), SI CMPL R11, $0x0000013c JB emit_copy2_1_match_nolit_encodeBlockAsm512K_emit2 CMPL R11, $0x0001003c JB emit_copy2_2_match_nolit_encodeBlockAsm512K_emit2 MOVB $0xfe, (CX) MOVL SI, 3(CX) ADDQ $0x06, CX JMP match_nolit_emitcopy_end_encodeBlockAsm512K emit_copy2_2_match_nolit_encodeBlockAsm512K_emit2: MOVB $0xfa, (CX) MOVW SI, 3(CX) ADDQ $0x05, CX JMP match_nolit_emitcopy_end_encodeBlockAsm512K emit_copy2_1_match_nolit_encodeBlockAsm512K_emit2: MOVB $0xf6, (CX) MOVB SI, 3(CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBlockAsm512K emit_copy2_0_match_nolit_encodeBlockAsm512K_emit2: MOVL $0x00000002, SI LEAL (SI)(R11*4), SI MOVB SI, (CX) ADDQ $0x03, CX match_nolit_emitcopy_end_encodeBlockAsm512K: CMPL DX, 8(SP) JAE emit_remainder_encodeBlockAsm512K MOVQ -2(BX)(DX*1), DI CMPQ CX, (SP) JB match_nolit_dst_ok_encodeBlockAsm512K MOVQ $0x00000000, ret+56(FP) RET match_nolit_dst_ok_encodeBlockAsm512K: MOVQ $0x0000cf1bbcdcbf9b, SI MOVQ DI, R8 SHRQ $0x10, DI MOVQ DI, R9 SHLQ $0x10, R8 IMULQ SI, R8 SHRQ $0x32, R8 SHLQ $0x10, R9 IMULQ SI, R9 SHRQ $0x32, R9 LEAL -2(DX), R10 MOVL (AX)(R9*4), SI MOVL R10, (AX)(R8*4) MOVL DX, (AX)(R9*4) MOVL DX, R8 
INCL DX CMPL (BX)(SI*1), DI JNE search_loop_encodeBlockAsm512K MOVL R8, DI SUBL SI, DI MOVL DI, 16(SP) CMPQ CX, (SP) JB dst_size_check_ok_4 MOVQ $0x00000000, ret+56(FP) RET dst_size_check_ok_4: ADDL $0x03, DX ADDL $0x04, SI MOVQ src_len+32(FP), DI SUBL DX, DI LEAQ (BX)(DX*1), R8 LEAQ (BX)(SI*1), SI // matchLen XORL R11, R11 JMP matchlen_loop_16_entry_match_nolit2_encodeBlockAsm512K matchlen_loopback_16_match_nolit2_encodeBlockAsm512K: MOVQ (R8)(R11*1), R9 MOVQ 8(R8)(R11*1), R10 XORQ (SI)(R11*1), R9 JNZ matchlen_bsf_8_match_nolit2_encodeBlockAsm512K XORQ 8(SI)(R11*1), R10 JNZ matchlen_bsf_16match_nolit2_encodeBlockAsm512K LEAL -16(DI), DI LEAL 16(R11), R11 matchlen_loop_16_entry_match_nolit2_encodeBlockAsm512K: CMPL DI, $0x10 JAE matchlen_loopback_16_match_nolit2_encodeBlockAsm512K JMP matchlen_match8_match_nolit2_encodeBlockAsm512K matchlen_bsf_16match_nolit2_encodeBlockAsm512K: #ifdef GOAMD64_v3 TZCNTQ R10, R10 #else BSFQ R10, R10 #endif SARQ $0x03, R10 LEAL 8(R11)(R10*1), R11 JMP match_nolit2_end_encodeBlockAsm512K matchlen_match8_match_nolit2_encodeBlockAsm512K: CMPL DI, $0x08 JB matchlen_match4_match_nolit2_encodeBlockAsm512K MOVQ (R8)(R11*1), R9 XORQ (SI)(R11*1), R9 JNZ matchlen_bsf_8_match_nolit2_encodeBlockAsm512K LEAL -8(DI), DI LEAL 8(R11), R11 JMP matchlen_match4_match_nolit2_encodeBlockAsm512K matchlen_bsf_8_match_nolit2_encodeBlockAsm512K: #ifdef GOAMD64_v3 TZCNTQ R9, R9 #else BSFQ R9, R9 #endif SARQ $0x03, R9 LEAL (R11)(R9*1), R11 JMP match_nolit2_end_encodeBlockAsm512K matchlen_match4_match_nolit2_encodeBlockAsm512K: CMPL DI, $0x04 JB matchlen_match2_match_nolit2_encodeBlockAsm512K MOVL (R8)(R11*1), R9 CMPL (SI)(R11*1), R9 JNE matchlen_match2_match_nolit2_encodeBlockAsm512K LEAL -4(DI), DI LEAL 4(R11), R11 matchlen_match2_match_nolit2_encodeBlockAsm512K: CMPL DI, $0x01 JE matchlen_match1_match_nolit2_encodeBlockAsm512K JB match_nolit2_end_encodeBlockAsm512K MOVW (R8)(R11*1), R9 CMPW (SI)(R11*1), R9 JNE matchlen_match1_match_nolit2_encodeBlockAsm512K 
LEAL 2(R11), R11 SUBL $0x02, DI JZ match_nolit2_end_encodeBlockAsm512K matchlen_match1_match_nolit2_encodeBlockAsm512K: MOVB (R8)(R11*1), R9 CMPB (SI)(R11*1), R9 JNE match_nolit2_end_encodeBlockAsm512K LEAL 1(R11), R11 match_nolit2_end_encodeBlockAsm512K: ADDL R11, DX ADDL $0x04, R11 MOVL DX, 12(SP) MOVL 16(SP), SI JMP match_nolits_copy_encodeBlockAsm512K emit_remainder_encodeBlockAsm512K: MOVQ src_len+32(FP), AX MOVL 12(SP), DX SUBL DX, AX JZ emit_remainder_end_encodeBlockAsm512K LEAQ (BX)(DX*1), DX LEAQ 4(CX)(AX*1), BX CMPQ BX, (SP) JB dst_size_check_ok_5 MOVQ $0x00000000, ret+56(FP) RET dst_size_check_ok_5: // emitLiteral LEAL -1(AX), BX CMPL BX, $0x1d JB one_byte_emit_remainder_encodeBlockAsm512K SUBL $0x1d, BX CMPL BX, $0x00000100 JB two_bytes_emit_remainder_encodeBlockAsm512K CMPL BX, $0x00010000 JB three_bytes_emit_remainder_encodeBlockAsm512K MOVL BX, SI SHRL $0x10, SI MOVB $0xf8, (CX) MOVW BX, 1(CX) MOVB SI, 3(CX) ADDQ $0x04, CX ADDL $0x1d, BX JMP memmove_long_emit_remainder_encodeBlockAsm512K three_bytes_emit_remainder_encodeBlockAsm512K: MOVB $0xf0, (CX) MOVW BX, 1(CX) ADDQ $0x03, CX ADDL $0x1d, BX JMP memmove_long_emit_remainder_encodeBlockAsm512K two_bytes_emit_remainder_encodeBlockAsm512K: MOVB $0xe8, (CX) MOVB BL, 1(CX) ADDL $0x1d, BX ADDQ $0x02, CX CMPL BX, $0x40 JB memmove_midemit_remainder_encodeBlockAsm512K JMP memmove_long_emit_remainder_encodeBlockAsm512K one_byte_emit_remainder_encodeBlockAsm512K: SHLB $0x03, BL MOVB BL, (CX) ADDQ $0x01, CX LEAQ (CX)(AX*1), BX // genMemMoveShort // margin: 0, min move: 1 CMPQ AX, $0x03 JB emit_lit_memmove_emit_remainder_encodeBlockAsm512K_memmove_move_1or2 JE emit_lit_memmove_emit_remainder_encodeBlockAsm512K_memmove_move_3 CMPQ AX, $0x08 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm512K_memmove_move_4through8 CMPQ AX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm512K_memmove_move_8through16 CMPQ AX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm512K_memmove_move_17through32 JMP 
emit_lit_memmove_emit_remainder_encodeBlockAsm512K_memmove_move_33through64 emit_lit_memmove_emit_remainder_encodeBlockAsm512K_memmove_move_1or2: MOVB (DX), SI MOVB -1(DX)(AX*1), DL MOVB SI, (CX) MOVB DL, -1(CX)(AX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm512K emit_lit_memmove_emit_remainder_encodeBlockAsm512K_memmove_move_3: MOVW (DX), SI MOVB 2(DX), DL MOVW SI, (CX) MOVB DL, 2(CX) JMP memmove_end_copy_emit_remainder_encodeBlockAsm512K emit_lit_memmove_emit_remainder_encodeBlockAsm512K_memmove_move_4through8: MOVL (DX), SI MOVL -4(DX)(AX*1), DX MOVL SI, (CX) MOVL DX, -4(CX)(AX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm512K emit_lit_memmove_emit_remainder_encodeBlockAsm512K_memmove_move_8through16: MOVQ (DX), SI MOVQ -8(DX)(AX*1), DX MOVQ SI, (CX) MOVQ DX, -8(CX)(AX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm512K emit_lit_memmove_emit_remainder_encodeBlockAsm512K_memmove_move_17through32: MOVOU (DX), X0 MOVOU -16(DX)(AX*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(AX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm512K emit_lit_memmove_emit_remainder_encodeBlockAsm512K_memmove_move_33through64: MOVOU (DX), X0 MOVOU 16(DX), X1 MOVOU -32(DX)(AX*1), X2 MOVOU -16(DX)(AX*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(AX*1) MOVOU X3, -16(CX)(AX*1) memmove_end_copy_emit_remainder_encodeBlockAsm512K: MOVQ BX, CX JMP emit_remainder_end_encodeBlockAsm512K memmove_midemit_remainder_encodeBlockAsm512K: LEAQ (CX)(AX*1), BX // genMemMoveShort // margin: 0, min move: 30 CMPQ AX, $0x20 JBE emit_lit_memmove_mid_emit_remainder_encodeBlockAsm512K_memmove_move_17through32 JMP emit_lit_memmove_mid_emit_remainder_encodeBlockAsm512K_memmove_move_33through64 emit_lit_memmove_mid_emit_remainder_encodeBlockAsm512K_memmove_move_17through32: MOVOU (DX), X0 MOVOU -16(DX)(AX*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(AX*1) JMP memmove_mid_end_copy_emit_remainder_encodeBlockAsm512K 
emit_lit_memmove_mid_emit_remainder_encodeBlockAsm512K_memmove_move_33through64: MOVOU (DX), X0 MOVOU 16(DX), X1 MOVOU -32(DX)(AX*1), X2 MOVOU -16(DX)(AX*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(AX*1) MOVOU X3, -16(CX)(AX*1) memmove_mid_end_copy_emit_remainder_encodeBlockAsm512K: MOVQ BX, CX JMP emit_remainder_end_encodeBlockAsm512K memmove_long_emit_remainder_encodeBlockAsm512K: LEAQ (CX)(AX*1), BX // genMemMoveLong MOVOU (DX), X0 MOVOU 16(DX), X1 MOVOU -32(DX)(AX*1), X2 MOVOU -16(DX)(AX*1), X3 MOVQ AX, DI SHRQ $0x05, DI MOVQ CX, SI ANDL $0x0000001f, SI MOVQ $0x00000040, R8 SUBQ SI, R8 DECQ DI JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm512Klarge_forward_sse_loop_32 LEAQ -32(DX)(R8*1), SI LEAQ -32(CX)(R8*1), R9 emit_lit_memmove_long_emit_remainder_encodeBlockAsm512Klarge_big_loop_back: MOVOU (SI), X4 MOVOU 16(SI), X5 MOVOA X4, (R9) MOVOA X5, 16(R9) ADDQ $0x20, R9 ADDQ $0x20, SI ADDQ $0x20, R8 DECQ DI JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm512Klarge_big_loop_back emit_lit_memmove_long_emit_remainder_encodeBlockAsm512Klarge_forward_sse_loop_32: MOVOU -32(DX)(R8*1), X4 MOVOU -16(DX)(R8*1), X5 MOVOA X4, -32(CX)(R8*1) MOVOA X5, -16(CX)(R8*1) ADDQ $0x20, R8 CMPQ AX, R8 JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm512Klarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(AX*1) MOVOU X3, -16(CX)(AX*1) MOVQ BX, CX emit_remainder_end_encodeBlockAsm512K: MOVQ dst_base+0(FP), AX SUBQ AX, CX MOVQ CX, ret+56(FP) RET // func encodeBlockAsm64K(dst []byte, src []byte, tmp *[16384]byte) int // Requires: BMI, CMOV, SSE2 TEXT ·encodeBlockAsm64K(SB), $24-64 MOVQ tmp+48(FP), AX MOVQ dst_base+0(FP), CX MOVQ $0x00000080, DX MOVQ AX, BX PXOR X0, X0 zero_loop_encodeBlockAsm64K: MOVOU X0, (BX) MOVOU X0, 16(BX) MOVOU X0, 32(BX) MOVOU X0, 48(BX) MOVOU X0, 64(BX) MOVOU X0, 80(BX) MOVOU X0, 96(BX) MOVOU X0, 112(BX) ADDQ $0x80, BX DECQ DX JNZ zero_loop_encodeBlockAsm64K MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), DX LEAQ 
-17(DX), BX LEAQ -17(DX), SI MOVL SI, 8(SP) SHRQ $0x05, DX SUBL DX, BX LEAQ (CX)(BX*1), BX MOVQ BX, (SP) MOVL $0x00000001, DX MOVL DX, 16(SP) MOVQ src_base+24(FP), BX search_loop_encodeBlockAsm64K: MOVL DX, SI SUBL 12(SP), SI SHRL $0x05, SI LEAL 4(DX)(SI*1), SI CMPL SI, 8(SP) JAE emit_remainder_encodeBlockAsm64K MOVQ (BX)(DX*1), DI MOVL SI, 20(SP) MOVQ $0x0000cf1bbcdcbf9b, R9 MOVQ DI, R10 MOVQ DI, R11 SHRQ $0x08, R11 SHLQ $0x10, R10 IMULQ R9, R10 SHRQ $0x33, R10 SHLQ $0x10, R11 IMULQ R9, R11 SHRQ $0x33, R11 MOVWLZX (AX)(R10*2), SI MOVWLZX (AX)(R11*2), R8 MOVW DX, (AX)(R10*2) MOVW DX, (AX)(R11*2) MOVQ DI, R10 SHRQ $0x10, R10 SHLQ $0x10, R10 IMULQ R9, R10 SHRQ $0x33, R10 MOVL DX, R9 SUBL 16(SP), R9 MOVL 1(BX)(R9*1), R11 MOVQ DI, R9 SHRQ $0x08, R9 CMPL R9, R11 JNE no_repeat_found_encodeBlockAsm64K LEAL 1(DX), DI MOVL 12(SP), SI MOVL DI, R8 SUBL 16(SP), R8 JZ repeat_extend_back_end_encodeBlockAsm64K repeat_extend_back_loop_encodeBlockAsm64K: CMPL DI, SI JBE repeat_extend_back_end_encodeBlockAsm64K MOVB -1(BX)(R8*1), R9 MOVB -1(BX)(DI*1), R10 CMPB R9, R10 JNE repeat_extend_back_end_encodeBlockAsm64K LEAL -1(DI), DI DECL R8 JNZ repeat_extend_back_loop_encodeBlockAsm64K repeat_extend_back_end_encodeBlockAsm64K: MOVL DI, SI MOVL 12(SP), R8 SUBL R8, SI LEAQ 4(CX)(SI*1), R9 CMPQ R9, (SP) JB dst_size_check_ok_1 MOVQ $0x00000000, ret+56(FP) RET dst_size_check_ok_1: LEAQ (BX)(R8*1), R8 // emitLiteral LEAL -1(SI), R9 CMPL R9, $0x1d JB one_byte_repeat_emit_lits_encodeBlockAsm64K SUBL $0x1d, R9 CMPL R9, $0x00000100 JB two_bytes_repeat_emit_lits_encodeBlockAsm64K JB three_bytes_repeat_emit_lits_encodeBlockAsm64K MOVL R9, R10 SHRL $0x10, R10 MOVB $0xf8, (CX) MOVW R9, 1(CX) MOVB R10, 3(CX) ADDQ $0x04, CX ADDL $0x1d, R9 JMP memmove_long_repeat_emit_lits_encodeBlockAsm64K three_bytes_repeat_emit_lits_encodeBlockAsm64K: MOVB $0xf0, (CX) MOVW R9, 1(CX) ADDQ $0x03, CX ADDL $0x1d, R9 JMP memmove_long_repeat_emit_lits_encodeBlockAsm64K two_bytes_repeat_emit_lits_encodeBlockAsm64K: MOVB 
$0xe8, (CX) MOVB R9, 1(CX) ADDL $0x1d, R9 ADDQ $0x02, CX CMPL R9, $0x40 JB memmove_midrepeat_emit_lits_encodeBlockAsm64K JMP memmove_long_repeat_emit_lits_encodeBlockAsm64K one_byte_repeat_emit_lits_encodeBlockAsm64K: SHLB $0x03, R9 MOVB R9, (CX) ADDQ $0x01, CX LEAQ (CX)(SI*1), R9 // genMemMoveShort // margin: 16, min move: 1 CMPQ SI, $0x10 JBE emit_lit_memmove_repeat_emit_lits_encodeBlockAsm64K_memmove_move_8through16 CMPQ SI, $0x20 JBE emit_lit_memmove_repeat_emit_lits_encodeBlockAsm64K_memmove_move_17through32 JMP emit_lit_memmove_repeat_emit_lits_encodeBlockAsm64K_memmove_move_33through64 emit_lit_memmove_repeat_emit_lits_encodeBlockAsm64K_memmove_move_8through16: MOVOU (R8), X0 MOVOU X0, (CX) JMP memmove_end_copy_repeat_emit_lits_encodeBlockAsm64K emit_lit_memmove_repeat_emit_lits_encodeBlockAsm64K_memmove_move_17through32: MOVOU (R8), X0 MOVOU -16(R8)(SI*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(SI*1) JMP memmove_end_copy_repeat_emit_lits_encodeBlockAsm64K emit_lit_memmove_repeat_emit_lits_encodeBlockAsm64K_memmove_move_33through64: MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(SI*1), X2 MOVOU -16(R8)(SI*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(SI*1) MOVOU X3, -16(CX)(SI*1) memmove_end_copy_repeat_emit_lits_encodeBlockAsm64K: MOVQ R9, CX JMP repeat_emit_lits_end_encodeBlockAsm64K memmove_midrepeat_emit_lits_encodeBlockAsm64K: LEAQ (CX)(SI*1), R9 // genMemMoveShort // margin: 15, min move: 30 CMPQ SI, $0x20 JBE emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm64K_memmove_move_17through32 JMP emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm64K_memmove_move_33through64 emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm64K_memmove_move_17through32: MOVOU (R8), X0 MOVOU -16(R8)(SI*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(SI*1) JMP memmove_mid_end_copy_repeat_emit_lits_encodeBlockAsm64K emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm64K_memmove_move_33through64: MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(SI*1), X2 MOVOU -16(R8)(SI*1), 
X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(SI*1) MOVOU X3, -16(CX)(SI*1) memmove_mid_end_copy_repeat_emit_lits_encodeBlockAsm64K: MOVQ R9, CX JMP repeat_emit_lits_end_encodeBlockAsm64K memmove_long_repeat_emit_lits_encodeBlockAsm64K: LEAQ (CX)(SI*1), R9 // genMemMoveLong MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(SI*1), X2 MOVOU -16(R8)(SI*1), X3 MOVQ SI, R11 SHRQ $0x05, R11 MOVQ CX, R10 ANDL $0x0000001f, R10 MOVQ $0x00000040, R12 SUBQ R10, R12 DECQ R11 JA emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm64Klarge_forward_sse_loop_32 LEAQ -32(R8)(R12*1), R10 LEAQ -32(CX)(R12*1), R13 emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm64Klarge_big_loop_back: MOVOU (R10), X4 MOVOU 16(R10), X5 MOVOA X4, (R13) MOVOA X5, 16(R13) ADDQ $0x20, R13 ADDQ $0x20, R10 ADDQ $0x20, R12 DECQ R11 JNA emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm64Klarge_big_loop_back emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm64Klarge_forward_sse_loop_32: MOVOU -32(R8)(R12*1), X4 MOVOU -16(R8)(R12*1), X5 MOVOA X4, -32(CX)(R12*1) MOVOA X5, -16(CX)(R12*1) ADDQ $0x20, R12 CMPQ SI, R12 JAE emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm64Klarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(SI*1) MOVOU X3, -16(CX)(SI*1) MOVQ R9, CX repeat_emit_lits_end_encodeBlockAsm64K: ADDL $0x05, DX MOVL DX, SI SUBL 16(SP), SI MOVQ src_len+32(FP), R8 SUBL DX, R8 LEAQ (BX)(DX*1), R9 LEAQ (BX)(SI*1), SI // matchLen XORL R11, R11 JMP matchlen_loop_16_entry_repeat_extend_encodeBlockAsm64K matchlen_loopback_16_repeat_extend_encodeBlockAsm64K: MOVQ (R9)(R11*1), R10 MOVQ 8(R9)(R11*1), R12 XORQ (SI)(R11*1), R10 JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm64K XORQ 8(SI)(R11*1), R12 JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm64K LEAL -16(R8), R8 LEAL 16(R11), R11 matchlen_loop_16_entry_repeat_extend_encodeBlockAsm64K: CMPL R8, $0x10 JAE matchlen_loopback_16_repeat_extend_encodeBlockAsm64K JMP matchlen_match8_repeat_extend_encodeBlockAsm64K 
matchlen_bsf_16repeat_extend_encodeBlockAsm64K: #ifdef GOAMD64_v3 TZCNTQ R12, R12 #else BSFQ R12, R12 #endif SARQ $0x03, R12 LEAL 8(R11)(R12*1), R11 JMP repeat_extend_forward_end_encodeBlockAsm64K matchlen_match8_repeat_extend_encodeBlockAsm64K: CMPL R8, $0x08 JB matchlen_match4_repeat_extend_encodeBlockAsm64K MOVQ (R9)(R11*1), R10 XORQ (SI)(R11*1), R10 JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm64K LEAL -8(R8), R8 LEAL 8(R11), R11 JMP matchlen_match4_repeat_extend_encodeBlockAsm64K matchlen_bsf_8_repeat_extend_encodeBlockAsm64K: #ifdef GOAMD64_v3 TZCNTQ R10, R10 #else BSFQ R10, R10 #endif SARQ $0x03, R10 LEAL (R11)(R10*1), R11 JMP repeat_extend_forward_end_encodeBlockAsm64K matchlen_match4_repeat_extend_encodeBlockAsm64K: CMPL R8, $0x04 JB matchlen_match2_repeat_extend_encodeBlockAsm64K MOVL (R9)(R11*1), R10 CMPL (SI)(R11*1), R10 JNE matchlen_match2_repeat_extend_encodeBlockAsm64K LEAL -4(R8), R8 LEAL 4(R11), R11 matchlen_match2_repeat_extend_encodeBlockAsm64K: CMPL R8, $0x01 JE matchlen_match1_repeat_extend_encodeBlockAsm64K JB repeat_extend_forward_end_encodeBlockAsm64K MOVW (R9)(R11*1), R10 CMPW (SI)(R11*1), R10 JNE matchlen_match1_repeat_extend_encodeBlockAsm64K LEAL 2(R11), R11 SUBL $0x02, R8 JZ repeat_extend_forward_end_encodeBlockAsm64K matchlen_match1_repeat_extend_encodeBlockAsm64K: MOVB (R9)(R11*1), R10 CMPB (SI)(R11*1), R10 JNE repeat_extend_forward_end_encodeBlockAsm64K LEAL 1(R11), R11 repeat_extend_forward_end_encodeBlockAsm64K: ADDL R11, DX MOVL DX, SI SUBL DI, SI MOVL 16(SP), DI // emitRepeat LEAL -1(SI), DI CMPL SI, $0x1d JBE repeat_one_match_repeat_encodeBlockAsm64K LEAL -30(SI), DI CMPL SI, $0x0000011e JB repeat_two_match_repeat_encodeBlockAsm64K CMPL SI, $0x0001001e JB repeat_three_match_repeat_encodeBlockAsm64K MOVB $0xfc, (CX) MOVL DI, 1(CX) ADDQ $0x04, CX JMP repeat_end_emit_encodeBlockAsm64K repeat_three_match_repeat_encodeBlockAsm64K: MOVB $0xf4, (CX) MOVW DI, 1(CX) ADDQ $0x03, CX JMP repeat_end_emit_encodeBlockAsm64K 
repeat_two_match_repeat_encodeBlockAsm64K: MOVB $0xec, (CX) MOVB DI, 1(CX) ADDQ $0x02, CX JMP repeat_end_emit_encodeBlockAsm64K repeat_one_match_repeat_encodeBlockAsm64K: XORL DI, DI LEAL -4(DI)(SI*8), DI MOVB DI, (CX) ADDQ $0x01, CX repeat_end_emit_encodeBlockAsm64K: MOVL DX, 12(SP) JMP search_loop_encodeBlockAsm64K no_repeat_found_encodeBlockAsm64K: CMPL (BX)(SI*1), DI JEQ candidate_match_encodeBlockAsm64K SHRQ $0x08, DI MOVWLZX (AX)(R10*2), SI LEAL 2(DX), R9 CMPL (BX)(R8*1), DI JEQ candidate2_match_encodeBlockAsm64K MOVW R9, (AX)(R10*2) SHRQ $0x08, DI CMPL (BX)(SI*1), DI JEQ candidate3_match_encodeBlockAsm64K MOVL 20(SP), DX JMP search_loop_encodeBlockAsm64K candidate3_match_encodeBlockAsm64K: ADDL $0x02, DX JMP candidate_match_encodeBlockAsm64K candidate2_match_encodeBlockAsm64K: MOVW R9, (AX)(R10*2) INCL DX MOVL R8, SI candidate_match_encodeBlockAsm64K: MOVL 12(SP), DI TESTL SI, SI JZ match_extend_back_end_encodeBlockAsm64K match_extend_back_loop_encodeBlockAsm64K: CMPL DX, DI JBE match_extend_back_end_encodeBlockAsm64K MOVB -1(BX)(SI*1), R8 MOVB -1(BX)(DX*1), R9 CMPB R8, R9 JNE match_extend_back_end_encodeBlockAsm64K LEAL -1(DX), DX DECL SI JZ match_extend_back_end_encodeBlockAsm64K JMP match_extend_back_loop_encodeBlockAsm64K match_extend_back_end_encodeBlockAsm64K: CMPQ CX, (SP) JB dst_size_check_ok_2 MOVQ $0x00000000, ret+56(FP) RET dst_size_check_ok_2: MOVL DX, R8 MOVL DX, DI SUBL SI, DI MOVL DI, 16(SP) ADDL $0x04, DX ADDL $0x04, SI MOVQ src_len+32(FP), DI SUBL DX, DI LEAQ (BX)(DX*1), R9 LEAQ (BX)(SI*1), SI // matchLen XORL R11, R11 JMP matchlen_loop_16_entry_match_nolit_encodeBlockAsm64K matchlen_loopback_16_match_nolit_encodeBlockAsm64K: MOVQ (R9)(R11*1), R10 MOVQ 8(R9)(R11*1), R12 XORQ (SI)(R11*1), R10 JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm64K XORQ 8(SI)(R11*1), R12 JNZ matchlen_bsf_16match_nolit_encodeBlockAsm64K LEAL -16(DI), DI LEAL 16(R11), R11 matchlen_loop_16_entry_match_nolit_encodeBlockAsm64K: CMPL DI, $0x10 JAE 
matchlen_loopback_16_match_nolit_encodeBlockAsm64K JMP matchlen_match8_match_nolit_encodeBlockAsm64K matchlen_bsf_16match_nolit_encodeBlockAsm64K: #ifdef GOAMD64_v3 TZCNTQ R12, R12 #else BSFQ R12, R12 #endif SARQ $0x03, R12 LEAL 8(R11)(R12*1), R11 JMP match_nolit_end_encodeBlockAsm64K matchlen_match8_match_nolit_encodeBlockAsm64K: CMPL DI, $0x08 JB matchlen_match4_match_nolit_encodeBlockAsm64K MOVQ (R9)(R11*1), R10 XORQ (SI)(R11*1), R10 JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm64K LEAL -8(DI), DI LEAL 8(R11), R11 JMP matchlen_match4_match_nolit_encodeBlockAsm64K matchlen_bsf_8_match_nolit_encodeBlockAsm64K: #ifdef GOAMD64_v3 TZCNTQ R10, R10 #else BSFQ R10, R10 #endif SARQ $0x03, R10 LEAL (R11)(R10*1), R11 JMP match_nolit_end_encodeBlockAsm64K matchlen_match4_match_nolit_encodeBlockAsm64K: CMPL DI, $0x04 JB matchlen_match2_match_nolit_encodeBlockAsm64K MOVL (R9)(R11*1), R10 CMPL (SI)(R11*1), R10 JNE matchlen_match2_match_nolit_encodeBlockAsm64K LEAL -4(DI), DI LEAL 4(R11), R11 matchlen_match2_match_nolit_encodeBlockAsm64K: CMPL DI, $0x01 JE matchlen_match1_match_nolit_encodeBlockAsm64K JB match_nolit_end_encodeBlockAsm64K MOVW (R9)(R11*1), R10 CMPW (SI)(R11*1), R10 JNE matchlen_match1_match_nolit_encodeBlockAsm64K LEAL 2(R11), R11 SUBL $0x02, DI JZ match_nolit_end_encodeBlockAsm64K matchlen_match1_match_nolit_encodeBlockAsm64K: MOVB (R9)(R11*1), R10 CMPB (SI)(R11*1), R10 JNE match_nolit_end_encodeBlockAsm64K LEAL 1(R11), R11 match_nolit_end_encodeBlockAsm64K: ADDL R11, DX ADDL $0x04, R11 MOVL 16(SP), SI MOVL 12(SP), DI MOVL DX, 12(SP) SUBL DI, R8 JZ match_nolits_copy_encodeBlockAsm64K LEAQ (BX)(DI*1), DI CMPL R8, $0x03 JA match_emit_lits_copy_encodeBlockAsm64K CMPL SI, $0x40 JB match_emit_lits_copy_encodeBlockAsm64K MOVL (DI), DI // emitCopy2WithLits XORQ R9, R9 SUBL $0x40, SI LEAL -11(R11), R10 LEAL -4(R11), R11 MOVW SI, 1(CX) CMPL R11, $0x07 CMOVLGE R10, R9 MOVQ $0x00000007, SI CMOVLLT R11, SI LEAL -1(R8)(SI*4), SI MOVL $0x00000003, R10 LEAL (R10)(SI*8), SI 
MOVB SI, (CX) ADDQ $0x03, CX MOVL DI, (CX) ADDQ R8, CX TESTL R9, R9 JZ match_nolit_emitcopy_end_encodeBlockAsm64K // emitRepeat LEAL -1(R9), SI CMPL R9, $0x1d JBE repeat_one_match_emit_repeat_copy2_encodeBlockAsm64K LEAL -30(R9), SI CMPL R9, $0x0000011e JB repeat_two_match_emit_repeat_copy2_encodeBlockAsm64K CMPL R9, $0x0001001e JB repeat_three_match_emit_repeat_copy2_encodeBlockAsm64K MOVB $0xfc, (CX) MOVL SI, 1(CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBlockAsm64K repeat_three_match_emit_repeat_copy2_encodeBlockAsm64K: MOVB $0xf4, (CX) MOVW SI, 1(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBlockAsm64K repeat_two_match_emit_repeat_copy2_encodeBlockAsm64K: MOVB $0xec, (CX) MOVB SI, 1(CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBlockAsm64K repeat_one_match_emit_repeat_copy2_encodeBlockAsm64K: XORL SI, SI LEAL -4(SI)(R9*8), SI MOVB SI, (CX) ADDQ $0x01, CX JMP match_nolit_emitcopy_end_encodeBlockAsm64K match_emit_lits_copy_encodeBlockAsm64K: LEAQ 4(CX)(R8*1), R9 CMPQ R9, (SP) JB dst_size_check_ok_3 MOVQ $0x00000000, ret+56(FP) RET dst_size_check_ok_3: // emitLiteral LEAL -1(R8), R9 CMPL R9, $0x1d JB one_byte_match_emit_encodeBlockAsm64K SUBL $0x1d, R9 CMPL R9, $0x00000100 JB two_bytes_match_emit_encodeBlockAsm64K JB three_bytes_match_emit_encodeBlockAsm64K MOVL R9, R10 SHRL $0x10, R10 MOVB $0xf8, (CX) MOVW R9, 1(CX) MOVB R10, 3(CX) ADDQ $0x04, CX ADDL $0x1d, R9 JMP memmove_long_match_emit_encodeBlockAsm64K three_bytes_match_emit_encodeBlockAsm64K: MOVB $0xf0, (CX) MOVW R9, 1(CX) ADDQ $0x03, CX ADDL $0x1d, R9 JMP memmove_long_match_emit_encodeBlockAsm64K two_bytes_match_emit_encodeBlockAsm64K: MOVB $0xe8, (CX) MOVB R9, 1(CX) ADDL $0x1d, R9 ADDQ $0x02, CX CMPL R9, $0x40 JB memmove_midmatch_emit_encodeBlockAsm64K JMP memmove_long_match_emit_encodeBlockAsm64K one_byte_match_emit_encodeBlockAsm64K: SHLB $0x03, R9 MOVB R9, (CX) ADDQ $0x01, CX LEAQ (CX)(R8*1), R9 // genMemMoveShort // margin: 16, min move: 1 CMPQ R8, $0x10 JBE 
emit_lit_memmove_match_emit_encodeBlockAsm64K_memmove_move_8through16 CMPQ R8, $0x20 JBE emit_lit_memmove_match_emit_encodeBlockAsm64K_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeBlockAsm64K_memmove_move_33through64 emit_lit_memmove_match_emit_encodeBlockAsm64K_memmove_move_8through16: MOVOU (DI), X0 MOVOU X0, (CX) JMP memmove_end_copy_match_emit_encodeBlockAsm64K emit_lit_memmove_match_emit_encodeBlockAsm64K_memmove_move_17through32: MOVOU (DI), X0 MOVOU -16(DI)(R8*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(R8*1) JMP memmove_end_copy_match_emit_encodeBlockAsm64K emit_lit_memmove_match_emit_encodeBlockAsm64K_memmove_move_33through64: MOVOU (DI), X0 MOVOU 16(DI), X1 MOVOU -32(DI)(R8*1), X2 MOVOU -16(DI)(R8*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R8*1) MOVOU X3, -16(CX)(R8*1) memmove_end_copy_match_emit_encodeBlockAsm64K: MOVQ R9, CX JMP match_nolits_copy_encodeBlockAsm64K memmove_midmatch_emit_encodeBlockAsm64K: LEAQ (CX)(R8*1), R9 // genMemMoveShort // margin: 15, min move: 30 CMPQ R8, $0x20 JBE emit_lit_memmove_mid_match_emit_encodeBlockAsm64K_memmove_move_17through32 JMP emit_lit_memmove_mid_match_emit_encodeBlockAsm64K_memmove_move_33through64 emit_lit_memmove_mid_match_emit_encodeBlockAsm64K_memmove_move_17through32: MOVOU (DI), X0 MOVOU -16(DI)(R8*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(R8*1) JMP memmove_mid_end_copy_match_emit_encodeBlockAsm64K emit_lit_memmove_mid_match_emit_encodeBlockAsm64K_memmove_move_33through64: MOVOU (DI), X0 MOVOU 16(DI), X1 MOVOU -32(DI)(R8*1), X2 MOVOU -16(DI)(R8*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R8*1) MOVOU X3, -16(CX)(R8*1) memmove_mid_end_copy_match_emit_encodeBlockAsm64K: MOVQ R9, CX JMP match_nolits_copy_encodeBlockAsm64K memmove_long_match_emit_encodeBlockAsm64K: LEAQ (CX)(R8*1), R9 // genMemMoveLong MOVOU (DI), X0 MOVOU 16(DI), X1 MOVOU -32(DI)(R8*1), X2 MOVOU -16(DI)(R8*1), X3 MOVQ R8, R12 SHRQ $0x05, R12 MOVQ CX, R10 ANDL $0x0000001f, R10 MOVQ $0x00000040, R13 SUBQ 
R10, R13 DECQ R12 JA emit_lit_memmove_long_match_emit_encodeBlockAsm64Klarge_forward_sse_loop_32 LEAQ -32(DI)(R13*1), R10 LEAQ -32(CX)(R13*1), R14 emit_lit_memmove_long_match_emit_encodeBlockAsm64Klarge_big_loop_back: MOVOU (R10), X4 MOVOU 16(R10), X5 MOVOA X4, (R14) MOVOA X5, 16(R14) ADDQ $0x20, R14 ADDQ $0x20, R10 ADDQ $0x20, R13 DECQ R12 JNA emit_lit_memmove_long_match_emit_encodeBlockAsm64Klarge_big_loop_back emit_lit_memmove_long_match_emit_encodeBlockAsm64Klarge_forward_sse_loop_32: MOVOU -32(DI)(R13*1), X4 MOVOU -16(DI)(R13*1), X5 MOVOA X4, -32(CX)(R13*1) MOVOA X5, -16(CX)(R13*1) ADDQ $0x20, R13 CMPQ R8, R13 JAE emit_lit_memmove_long_match_emit_encodeBlockAsm64Klarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R8*1) MOVOU X3, -16(CX)(R8*1) MOVQ R9, CX match_nolits_copy_encodeBlockAsm64K: // emitCopy CMPL SI, $0x00000400 JA two_byte_match_nolit_encodeBlockAsm64K CMPL R11, $0x00000013 JAE emit_one_longer_match_nolit_encodeBlockAsm64K LEAL -1(SI), SI SHLL $0x06, SI LEAL -15(SI)(R11*4), SI MOVW SI, (CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBlockAsm64K emit_one_longer_match_nolit_encodeBlockAsm64K: CMPL R11, $0x00000112 JAE emit_copy1_repeat_match_nolit_encodeBlockAsm64K LEAL -1(SI), SI SHLL $0x06, SI LEAL 61(SI), SI MOVW SI, (CX) LEAL -18(R11), SI MOVB SI, 2(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBlockAsm64K emit_copy1_repeat_match_nolit_encodeBlockAsm64K: LEAL -1(SI), SI SHLL $0x06, SI LEAL 57(SI), SI MOVW SI, (CX) ADDQ $0x02, CX SUBL $0x12, R11 // emitRepeat LEAL -1(R11), SI CMPL R11, $0x1d JBE repeat_one_emit_copy1_do_repeat_match_nolit_encodeBlockAsm64K LEAL -30(R11), SI CMPL R11, $0x0000011e JB repeat_two_emit_copy1_do_repeat_match_nolit_encodeBlockAsm64K CMPL R11, $0x0001001e JB repeat_three_emit_copy1_do_repeat_match_nolit_encodeBlockAsm64K MOVB $0xfc, (CX) MOVL SI, 1(CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBlockAsm64K repeat_three_emit_copy1_do_repeat_match_nolit_encodeBlockAsm64K: MOVB 
$0xf4, (CX) MOVW SI, 1(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBlockAsm64K repeat_two_emit_copy1_do_repeat_match_nolit_encodeBlockAsm64K: MOVB $0xec, (CX) MOVB SI, 1(CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBlockAsm64K repeat_one_emit_copy1_do_repeat_match_nolit_encodeBlockAsm64K: XORL SI, SI LEAL -4(SI)(R11*8), SI MOVB SI, (CX) ADDQ $0x01, CX JMP match_nolit_emitcopy_end_encodeBlockAsm64K two_byte_match_nolit_encodeBlockAsm64K: // emitCopy2 LEAL -64(SI), SI LEAL -4(R11), R11 MOVW SI, 1(CX) CMPL R11, $0x3c JBE emit_copy2_0_match_nolit_encodeBlockAsm64K_emit2 LEAL -60(R11), SI CMPL R11, $0x0000013c JB emit_copy2_1_match_nolit_encodeBlockAsm64K_emit2 CMPL R11, $0x0001003c JB emit_copy2_2_match_nolit_encodeBlockAsm64K_emit2 MOVB $0xfe, (CX) MOVL SI, 3(CX) ADDQ $0x06, CX JMP match_nolit_emitcopy_end_encodeBlockAsm64K emit_copy2_2_match_nolit_encodeBlockAsm64K_emit2: MOVB $0xfa, (CX) MOVW SI, 3(CX) ADDQ $0x05, CX JMP match_nolit_emitcopy_end_encodeBlockAsm64K emit_copy2_1_match_nolit_encodeBlockAsm64K_emit2: MOVB $0xf6, (CX) MOVB SI, 3(CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBlockAsm64K emit_copy2_0_match_nolit_encodeBlockAsm64K_emit2: MOVL $0x00000002, SI LEAL (SI)(R11*4), SI MOVB SI, (CX) ADDQ $0x03, CX match_nolit_emitcopy_end_encodeBlockAsm64K: CMPL DX, 8(SP) JAE emit_remainder_encodeBlockAsm64K MOVQ -2(BX)(DX*1), DI CMPQ CX, (SP) JB match_nolit_dst_ok_encodeBlockAsm64K MOVQ $0x00000000, ret+56(FP) RET match_nolit_dst_ok_encodeBlockAsm64K: MOVQ $0x0000cf1bbcdcbf9b, SI MOVQ DI, R8 SHRQ $0x10, DI MOVQ DI, R9 SHLQ $0x10, R8 IMULQ SI, R8 SHRQ $0x33, R8 SHLQ $0x10, R9 IMULQ SI, R9 SHRQ $0x33, R9 LEAL -2(DX), R10 MOVWLZX (AX)(R9*2), SI MOVW R10, (AX)(R8*2) MOVW DX, (AX)(R9*2) MOVL DX, R8 INCL DX CMPL (BX)(SI*1), DI JNE search_loop_encodeBlockAsm64K MOVL R8, DI SUBL SI, DI MOVL DI, 16(SP) CMPQ CX, (SP) JB dst_size_check_ok_4 MOVQ $0x00000000, ret+56(FP) RET dst_size_check_ok_4: ADDL $0x03, DX ADDL $0x04, SI MOVQ src_len+32(FP), DI 
SUBL DX, DI LEAQ (BX)(DX*1), R8 LEAQ (BX)(SI*1), SI // matchLen XORL R11, R11 JMP matchlen_loop_16_entry_match_nolit2_encodeBlockAsm64K matchlen_loopback_16_match_nolit2_encodeBlockAsm64K: MOVQ (R8)(R11*1), R9 MOVQ 8(R8)(R11*1), R10 XORQ (SI)(R11*1), R9 JNZ matchlen_bsf_8_match_nolit2_encodeBlockAsm64K XORQ 8(SI)(R11*1), R10 JNZ matchlen_bsf_16match_nolit2_encodeBlockAsm64K LEAL -16(DI), DI LEAL 16(R11), R11 matchlen_loop_16_entry_match_nolit2_encodeBlockAsm64K: CMPL DI, $0x10 JAE matchlen_loopback_16_match_nolit2_encodeBlockAsm64K JMP matchlen_match8_match_nolit2_encodeBlockAsm64K matchlen_bsf_16match_nolit2_encodeBlockAsm64K: #ifdef GOAMD64_v3 TZCNTQ R10, R10 #else BSFQ R10, R10 #endif SARQ $0x03, R10 LEAL 8(R11)(R10*1), R11 JMP match_nolit2_end_encodeBlockAsm64K matchlen_match8_match_nolit2_encodeBlockAsm64K: CMPL DI, $0x08 JB matchlen_match4_match_nolit2_encodeBlockAsm64K MOVQ (R8)(R11*1), R9 XORQ (SI)(R11*1), R9 JNZ matchlen_bsf_8_match_nolit2_encodeBlockAsm64K LEAL -8(DI), DI LEAL 8(R11), R11 JMP matchlen_match4_match_nolit2_encodeBlockAsm64K matchlen_bsf_8_match_nolit2_encodeBlockAsm64K: #ifdef GOAMD64_v3 TZCNTQ R9, R9 #else BSFQ R9, R9 #endif SARQ $0x03, R9 LEAL (R11)(R9*1), R11 JMP match_nolit2_end_encodeBlockAsm64K matchlen_match4_match_nolit2_encodeBlockAsm64K: CMPL DI, $0x04 JB matchlen_match2_match_nolit2_encodeBlockAsm64K MOVL (R8)(R11*1), R9 CMPL (SI)(R11*1), R9 JNE matchlen_match2_match_nolit2_encodeBlockAsm64K LEAL -4(DI), DI LEAL 4(R11), R11 matchlen_match2_match_nolit2_encodeBlockAsm64K: CMPL DI, $0x01 JE matchlen_match1_match_nolit2_encodeBlockAsm64K JB match_nolit2_end_encodeBlockAsm64K MOVW (R8)(R11*1), R9 CMPW (SI)(R11*1), R9 JNE matchlen_match1_match_nolit2_encodeBlockAsm64K LEAL 2(R11), R11 SUBL $0x02, DI JZ match_nolit2_end_encodeBlockAsm64K matchlen_match1_match_nolit2_encodeBlockAsm64K: MOVB (R8)(R11*1), R9 CMPB (SI)(R11*1), R9 JNE match_nolit2_end_encodeBlockAsm64K LEAL 1(R11), R11 match_nolit2_end_encodeBlockAsm64K: ADDL R11, DX ADDL 
$0x04, R11 MOVL DX, 12(SP) MOVL 16(SP), SI JMP match_nolits_copy_encodeBlockAsm64K emit_remainder_encodeBlockAsm64K: MOVQ src_len+32(FP), AX MOVL 12(SP), DX SUBL DX, AX JZ emit_remainder_end_encodeBlockAsm64K LEAQ (BX)(DX*1), DX LEAQ 4(CX)(AX*1), BX CMPQ BX, (SP) JB dst_size_check_ok_5 MOVQ $0x00000000, ret+56(FP) RET dst_size_check_ok_5: // emitLiteral LEAL -1(AX), BX CMPL BX, $0x1d JB one_byte_emit_remainder_encodeBlockAsm64K SUBL $0x1d, BX CMPL BX, $0x00000100 JB two_bytes_emit_remainder_encodeBlockAsm64K JB three_bytes_emit_remainder_encodeBlockAsm64K MOVL BX, SI SHRL $0x10, SI MOVB $0xf8, (CX) MOVW BX, 1(CX) MOVB SI, 3(CX) ADDQ $0x04, CX ADDL $0x1d, BX JMP memmove_long_emit_remainder_encodeBlockAsm64K three_bytes_emit_remainder_encodeBlockAsm64K: MOVB $0xf0, (CX) MOVW BX, 1(CX) ADDQ $0x03, CX ADDL $0x1d, BX JMP memmove_long_emit_remainder_encodeBlockAsm64K two_bytes_emit_remainder_encodeBlockAsm64K: MOVB $0xe8, (CX) MOVB BL, 1(CX) ADDL $0x1d, BX ADDQ $0x02, CX CMPL BX, $0x40 JB memmove_midemit_remainder_encodeBlockAsm64K JMP memmove_long_emit_remainder_encodeBlockAsm64K one_byte_emit_remainder_encodeBlockAsm64K: SHLB $0x03, BL MOVB BL, (CX) ADDQ $0x01, CX LEAQ (CX)(AX*1), BX // genMemMoveShort // margin: 0, min move: 1 CMPQ AX, $0x03 JB emit_lit_memmove_emit_remainder_encodeBlockAsm64K_memmove_move_1or2 JE emit_lit_memmove_emit_remainder_encodeBlockAsm64K_memmove_move_3 CMPQ AX, $0x08 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm64K_memmove_move_4through8 CMPQ AX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm64K_memmove_move_8through16 CMPQ AX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm64K_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeBlockAsm64K_memmove_move_33through64 emit_lit_memmove_emit_remainder_encodeBlockAsm64K_memmove_move_1or2: MOVB (DX), SI MOVB -1(DX)(AX*1), DL MOVB SI, (CX) MOVB DL, -1(CX)(AX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm64K 
emit_lit_memmove_emit_remainder_encodeBlockAsm64K_memmove_move_3: MOVW (DX), SI MOVB 2(DX), DL MOVW SI, (CX) MOVB DL, 2(CX) JMP memmove_end_copy_emit_remainder_encodeBlockAsm64K emit_lit_memmove_emit_remainder_encodeBlockAsm64K_memmove_move_4through8: MOVL (DX), SI MOVL -4(DX)(AX*1), DX MOVL SI, (CX) MOVL DX, -4(CX)(AX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm64K emit_lit_memmove_emit_remainder_encodeBlockAsm64K_memmove_move_8through16: MOVQ (DX), SI MOVQ -8(DX)(AX*1), DX MOVQ SI, (CX) MOVQ DX, -8(CX)(AX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm64K emit_lit_memmove_emit_remainder_encodeBlockAsm64K_memmove_move_17through32: MOVOU (DX), X0 MOVOU -16(DX)(AX*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(AX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm64K emit_lit_memmove_emit_remainder_encodeBlockAsm64K_memmove_move_33through64: MOVOU (DX), X0 MOVOU 16(DX), X1 MOVOU -32(DX)(AX*1), X2 MOVOU -16(DX)(AX*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(AX*1) MOVOU X3, -16(CX)(AX*1) memmove_end_copy_emit_remainder_encodeBlockAsm64K: MOVQ BX, CX JMP emit_remainder_end_encodeBlockAsm64K memmove_midemit_remainder_encodeBlockAsm64K: LEAQ (CX)(AX*1), BX // genMemMoveShort // margin: 0, min move: 30 CMPQ AX, $0x20 JBE emit_lit_memmove_mid_emit_remainder_encodeBlockAsm64K_memmove_move_17through32 JMP emit_lit_memmove_mid_emit_remainder_encodeBlockAsm64K_memmove_move_33through64 emit_lit_memmove_mid_emit_remainder_encodeBlockAsm64K_memmove_move_17through32: MOVOU (DX), X0 MOVOU -16(DX)(AX*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(AX*1) JMP memmove_mid_end_copy_emit_remainder_encodeBlockAsm64K emit_lit_memmove_mid_emit_remainder_encodeBlockAsm64K_memmove_move_33through64: MOVOU (DX), X0 MOVOU 16(DX), X1 MOVOU -32(DX)(AX*1), X2 MOVOU -16(DX)(AX*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(AX*1) MOVOU X3, -16(CX)(AX*1) memmove_mid_end_copy_emit_remainder_encodeBlockAsm64K: MOVQ BX, CX JMP emit_remainder_end_encodeBlockAsm64K 
memmove_long_emit_remainder_encodeBlockAsm64K: LEAQ (CX)(AX*1), BX // genMemMoveLong MOVOU (DX), X0 MOVOU 16(DX), X1 MOVOU -32(DX)(AX*1), X2 MOVOU -16(DX)(AX*1), X3 MOVQ AX, DI SHRQ $0x05, DI MOVQ CX, SI ANDL $0x0000001f, SI MOVQ $0x00000040, R8 SUBQ SI, R8 DECQ DI JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm64Klarge_forward_sse_loop_32 LEAQ -32(DX)(R8*1), SI LEAQ -32(CX)(R8*1), R9 emit_lit_memmove_long_emit_remainder_encodeBlockAsm64Klarge_big_loop_back: MOVOU (SI), X4 MOVOU 16(SI), X5 MOVOA X4, (R9) MOVOA X5, 16(R9) ADDQ $0x20, R9 ADDQ $0x20, SI ADDQ $0x20, R8 DECQ DI JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm64Klarge_big_loop_back emit_lit_memmove_long_emit_remainder_encodeBlockAsm64Klarge_forward_sse_loop_32: MOVOU -32(DX)(R8*1), X4 MOVOU -16(DX)(R8*1), X5 MOVOA X4, -32(CX)(R8*1) MOVOA X5, -16(CX)(R8*1) ADDQ $0x20, R8 CMPQ AX, R8 JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm64Klarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(AX*1) MOVOU X3, -16(CX)(AX*1) MOVQ BX, CX emit_remainder_end_encodeBlockAsm64K: MOVQ dst_base+0(FP), AX SUBQ AX, CX MOVQ CX, ret+56(FP) RET // func encodeBlockAsm16K(dst []byte, src []byte, tmp *[8192]byte) int // Requires: BMI, CMOV, SSE2 TEXT ·encodeBlockAsm16K(SB), $24-64 MOVQ tmp+48(FP), AX MOVQ dst_base+0(FP), CX MOVQ $0x00000040, DX MOVQ AX, BX PXOR X0, X0 zero_loop_encodeBlockAsm16K: MOVOU X0, (BX) MOVOU X0, 16(BX) MOVOU X0, 32(BX) MOVOU X0, 48(BX) MOVOU X0, 64(BX) MOVOU X0, 80(BX) MOVOU X0, 96(BX) MOVOU X0, 112(BX) ADDQ $0x80, BX DECQ DX JNZ zero_loop_encodeBlockAsm16K MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), DX LEAQ -17(DX), BX LEAQ -17(DX), SI MOVL SI, 8(SP) SHRQ $0x05, DX SUBL DX, BX LEAQ (CX)(BX*1), BX MOVQ BX, (SP) MOVL $0x00000001, DX MOVL DX, 16(SP) MOVQ src_base+24(FP), BX search_loop_encodeBlockAsm16K: MOVL DX, SI SUBL 12(SP), SI SHRL $0x05, SI LEAL 4(DX)(SI*1), SI CMPL SI, 8(SP) JAE emit_remainder_encodeBlockAsm16K MOVQ (BX)(DX*1), DI MOVL SI, 20(SP) MOVQ 
$0x000000cf1bbcdcbb, R9 MOVQ DI, R10 MOVQ DI, R11 SHRQ $0x08, R11 SHLQ $0x18, R10 IMULQ R9, R10 SHRQ $0x34, R10 SHLQ $0x18, R11 IMULQ R9, R11 SHRQ $0x34, R11 MOVWLZX (AX)(R10*2), SI MOVWLZX (AX)(R11*2), R8 MOVW DX, (AX)(R10*2) MOVW DX, (AX)(R11*2) MOVQ DI, R10 SHRQ $0x10, R10 SHLQ $0x18, R10 IMULQ R9, R10 SHRQ $0x34, R10 MOVL DX, R9 SUBL 16(SP), R9 MOVL 1(BX)(R9*1), R11 MOVQ DI, R9 SHRQ $0x08, R9 CMPL R9, R11 JNE no_repeat_found_encodeBlockAsm16K LEAL 1(DX), DI MOVL 12(SP), SI MOVL DI, R8 SUBL 16(SP), R8 JZ repeat_extend_back_end_encodeBlockAsm16K repeat_extend_back_loop_encodeBlockAsm16K: CMPL DI, SI JBE repeat_extend_back_end_encodeBlockAsm16K MOVB -1(BX)(R8*1), R9 MOVB -1(BX)(DI*1), R10 CMPB R9, R10 JNE repeat_extend_back_end_encodeBlockAsm16K LEAL -1(DI), DI DECL R8 JNZ repeat_extend_back_loop_encodeBlockAsm16K repeat_extend_back_end_encodeBlockAsm16K: MOVL DI, SI MOVL 12(SP), R8 SUBL R8, SI LEAQ 3(CX)(SI*1), R9 CMPQ R9, (SP) JB dst_size_check_ok_1 MOVQ $0x00000000, ret+56(FP) RET dst_size_check_ok_1: LEAQ (BX)(R8*1), R8 // emitLiteral LEAL -1(SI), R9 CMPL R9, $0x1d JB one_byte_repeat_emit_lits_encodeBlockAsm16K SUBL $0x1d, R9 CMPL R9, $0x00000100 JB two_bytes_repeat_emit_lits_encodeBlockAsm16K JB three_bytes_repeat_emit_lits_encodeBlockAsm16K three_bytes_repeat_emit_lits_encodeBlockAsm16K: MOVB $0xf0, (CX) MOVW R9, 1(CX) ADDQ $0x03, CX ADDL $0x1d, R9 JMP memmove_long_repeat_emit_lits_encodeBlockAsm16K two_bytes_repeat_emit_lits_encodeBlockAsm16K: MOVB $0xe8, (CX) MOVB R9, 1(CX) ADDL $0x1d, R9 ADDQ $0x02, CX CMPL R9, $0x40 JB memmove_midrepeat_emit_lits_encodeBlockAsm16K JMP memmove_long_repeat_emit_lits_encodeBlockAsm16K one_byte_repeat_emit_lits_encodeBlockAsm16K: SHLB $0x03, R9 MOVB R9, (CX) ADDQ $0x01, CX LEAQ (CX)(SI*1), R9 // genMemMoveShort // margin: 16, min move: 1 CMPQ SI, $0x10 JBE emit_lit_memmove_repeat_emit_lits_encodeBlockAsm16K_memmove_move_8through16 CMPQ SI, $0x20 JBE emit_lit_memmove_repeat_emit_lits_encodeBlockAsm16K_memmove_move_17through32 
JMP emit_lit_memmove_repeat_emit_lits_encodeBlockAsm16K_memmove_move_33through64 emit_lit_memmove_repeat_emit_lits_encodeBlockAsm16K_memmove_move_8through16: MOVOU (R8), X0 MOVOU X0, (CX) JMP memmove_end_copy_repeat_emit_lits_encodeBlockAsm16K emit_lit_memmove_repeat_emit_lits_encodeBlockAsm16K_memmove_move_17through32: MOVOU (R8), X0 MOVOU -16(R8)(SI*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(SI*1) JMP memmove_end_copy_repeat_emit_lits_encodeBlockAsm16K emit_lit_memmove_repeat_emit_lits_encodeBlockAsm16K_memmove_move_33through64: MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(SI*1), X2 MOVOU -16(R8)(SI*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(SI*1) MOVOU X3, -16(CX)(SI*1) memmove_end_copy_repeat_emit_lits_encodeBlockAsm16K: MOVQ R9, CX JMP repeat_emit_lits_end_encodeBlockAsm16K memmove_midrepeat_emit_lits_encodeBlockAsm16K: LEAQ (CX)(SI*1), R9 // genMemMoveShort // margin: 15, min move: 30 CMPQ SI, $0x20 JBE emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm16K_memmove_move_17through32 JMP emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm16K_memmove_move_33through64 emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm16K_memmove_move_17through32: MOVOU (R8), X0 MOVOU -16(R8)(SI*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(SI*1) JMP memmove_mid_end_copy_repeat_emit_lits_encodeBlockAsm16K emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm16K_memmove_move_33through64: MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(SI*1), X2 MOVOU -16(R8)(SI*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(SI*1) MOVOU X3, -16(CX)(SI*1) memmove_mid_end_copy_repeat_emit_lits_encodeBlockAsm16K: MOVQ R9, CX JMP repeat_emit_lits_end_encodeBlockAsm16K memmove_long_repeat_emit_lits_encodeBlockAsm16K: LEAQ (CX)(SI*1), R9 // genMemMoveLong MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(SI*1), X2 MOVOU -16(R8)(SI*1), X3 MOVQ SI, R11 SHRQ $0x05, R11 MOVQ CX, R10 ANDL $0x0000001f, R10 MOVQ $0x00000040, R12 SUBQ R10, R12 DECQ R11 JA 
emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm16Klarge_forward_sse_loop_32 LEAQ -32(R8)(R12*1), R10 LEAQ -32(CX)(R12*1), R13 emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm16Klarge_big_loop_back: MOVOU (R10), X4 MOVOU 16(R10), X5 MOVOA X4, (R13) MOVOA X5, 16(R13) ADDQ $0x20, R13 ADDQ $0x20, R10 ADDQ $0x20, R12 DECQ R11 JNA emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm16Klarge_big_loop_back emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm16Klarge_forward_sse_loop_32: MOVOU -32(R8)(R12*1), X4 MOVOU -16(R8)(R12*1), X5 MOVOA X4, -32(CX)(R12*1) MOVOA X5, -16(CX)(R12*1) ADDQ $0x20, R12 CMPQ SI, R12 JAE emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm16Klarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(SI*1) MOVOU X3, -16(CX)(SI*1) MOVQ R9, CX repeat_emit_lits_end_encodeBlockAsm16K: ADDL $0x05, DX MOVL DX, SI SUBL 16(SP), SI MOVQ src_len+32(FP), R8 SUBL DX, R8 LEAQ (BX)(DX*1), R9 LEAQ (BX)(SI*1), SI // matchLen XORL R11, R11 JMP matchlen_loop_16_entry_repeat_extend_encodeBlockAsm16K matchlen_loopback_16_repeat_extend_encodeBlockAsm16K: MOVQ (R9)(R11*1), R10 MOVQ 8(R9)(R11*1), R12 XORQ (SI)(R11*1), R10 JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm16K XORQ 8(SI)(R11*1), R12 JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm16K LEAL -16(R8), R8 LEAL 16(R11), R11 matchlen_loop_16_entry_repeat_extend_encodeBlockAsm16K: CMPL R8, $0x10 JAE matchlen_loopback_16_repeat_extend_encodeBlockAsm16K JMP matchlen_match8_repeat_extend_encodeBlockAsm16K matchlen_bsf_16repeat_extend_encodeBlockAsm16K: #ifdef GOAMD64_v3 TZCNTQ R12, R12 #else BSFQ R12, R12 #endif SARQ $0x03, R12 LEAL 8(R11)(R12*1), R11 JMP repeat_extend_forward_end_encodeBlockAsm16K matchlen_match8_repeat_extend_encodeBlockAsm16K: CMPL R8, $0x08 JB matchlen_match4_repeat_extend_encodeBlockAsm16K MOVQ (R9)(R11*1), R10 XORQ (SI)(R11*1), R10 JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm16K LEAL -8(R8), R8 LEAL 8(R11), R11 JMP 
matchlen_match4_repeat_extend_encodeBlockAsm16K matchlen_bsf_8_repeat_extend_encodeBlockAsm16K: #ifdef GOAMD64_v3 TZCNTQ R10, R10 #else BSFQ R10, R10 #endif SARQ $0x03, R10 LEAL (R11)(R10*1), R11 JMP repeat_extend_forward_end_encodeBlockAsm16K matchlen_match4_repeat_extend_encodeBlockAsm16K: CMPL R8, $0x04 JB matchlen_match2_repeat_extend_encodeBlockAsm16K MOVL (R9)(R11*1), R10 CMPL (SI)(R11*1), R10 JNE matchlen_match2_repeat_extend_encodeBlockAsm16K LEAL -4(R8), R8 LEAL 4(R11), R11 matchlen_match2_repeat_extend_encodeBlockAsm16K: CMPL R8, $0x01 JE matchlen_match1_repeat_extend_encodeBlockAsm16K JB repeat_extend_forward_end_encodeBlockAsm16K MOVW (R9)(R11*1), R10 CMPW (SI)(R11*1), R10 JNE matchlen_match1_repeat_extend_encodeBlockAsm16K LEAL 2(R11), R11 SUBL $0x02, R8 JZ repeat_extend_forward_end_encodeBlockAsm16K matchlen_match1_repeat_extend_encodeBlockAsm16K: MOVB (R9)(R11*1), R10 CMPB (SI)(R11*1), R10 JNE repeat_extend_forward_end_encodeBlockAsm16K LEAL 1(R11), R11 repeat_extend_forward_end_encodeBlockAsm16K: ADDL R11, DX MOVL DX, SI SUBL DI, SI MOVL 16(SP), DI // emitRepeat LEAL -1(SI), DI CMPL SI, $0x1d JBE repeat_one_match_repeat_encodeBlockAsm16K LEAL -30(SI), DI CMPL SI, $0x0000011e JB repeat_two_match_repeat_encodeBlockAsm16K CMPL SI, $0x0001001e JB repeat_three_match_repeat_encodeBlockAsm16K MOVB $0xfc, (CX) MOVL DI, 1(CX) ADDQ $0x04, CX JMP repeat_end_emit_encodeBlockAsm16K repeat_three_match_repeat_encodeBlockAsm16K: MOVB $0xf4, (CX) MOVW DI, 1(CX) ADDQ $0x03, CX JMP repeat_end_emit_encodeBlockAsm16K repeat_two_match_repeat_encodeBlockAsm16K: MOVB $0xec, (CX) MOVB DI, 1(CX) ADDQ $0x02, CX JMP repeat_end_emit_encodeBlockAsm16K repeat_one_match_repeat_encodeBlockAsm16K: XORL DI, DI LEAL -4(DI)(SI*8), DI MOVB DI, (CX) ADDQ $0x01, CX repeat_end_emit_encodeBlockAsm16K: MOVL DX, 12(SP) JMP search_loop_encodeBlockAsm16K no_repeat_found_encodeBlockAsm16K: CMPL (BX)(SI*1), DI JEQ candidate_match_encodeBlockAsm16K SHRQ $0x08, DI MOVWLZX (AX)(R10*2), SI LEAL 
2(DX), R9 CMPL (BX)(R8*1), DI JEQ candidate2_match_encodeBlockAsm16K MOVW R9, (AX)(R10*2) SHRQ $0x08, DI CMPL (BX)(SI*1), DI JEQ candidate3_match_encodeBlockAsm16K MOVL 20(SP), DX JMP search_loop_encodeBlockAsm16K candidate3_match_encodeBlockAsm16K: ADDL $0x02, DX JMP candidate_match_encodeBlockAsm16K candidate2_match_encodeBlockAsm16K: MOVW R9, (AX)(R10*2) INCL DX MOVL R8, SI candidate_match_encodeBlockAsm16K: MOVL 12(SP), DI TESTL SI, SI JZ match_extend_back_end_encodeBlockAsm16K match_extend_back_loop_encodeBlockAsm16K: CMPL DX, DI JBE match_extend_back_end_encodeBlockAsm16K MOVB -1(BX)(SI*1), R8 MOVB -1(BX)(DX*1), R9 CMPB R8, R9 JNE match_extend_back_end_encodeBlockAsm16K LEAL -1(DX), DX DECL SI JZ match_extend_back_end_encodeBlockAsm16K JMP match_extend_back_loop_encodeBlockAsm16K match_extend_back_end_encodeBlockAsm16K: CMPQ CX, (SP) JB dst_size_check_ok_2 MOVQ $0x00000000, ret+56(FP) RET dst_size_check_ok_2: MOVL DX, R8 MOVL DX, DI SUBL SI, DI MOVL DI, 16(SP) ADDL $0x04, DX ADDL $0x04, SI MOVQ src_len+32(FP), DI SUBL DX, DI LEAQ (BX)(DX*1), R9 LEAQ (BX)(SI*1), SI // matchLen XORL R11, R11 JMP matchlen_loop_16_entry_match_nolit_encodeBlockAsm16K matchlen_loopback_16_match_nolit_encodeBlockAsm16K: MOVQ (R9)(R11*1), R10 MOVQ 8(R9)(R11*1), R12 XORQ (SI)(R11*1), R10 JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm16K XORQ 8(SI)(R11*1), R12 JNZ matchlen_bsf_16match_nolit_encodeBlockAsm16K LEAL -16(DI), DI LEAL 16(R11), R11 matchlen_loop_16_entry_match_nolit_encodeBlockAsm16K: CMPL DI, $0x10 JAE matchlen_loopback_16_match_nolit_encodeBlockAsm16K JMP matchlen_match8_match_nolit_encodeBlockAsm16K matchlen_bsf_16match_nolit_encodeBlockAsm16K: #ifdef GOAMD64_v3 TZCNTQ R12, R12 #else BSFQ R12, R12 #endif SARQ $0x03, R12 LEAL 8(R11)(R12*1), R11 JMP match_nolit_end_encodeBlockAsm16K matchlen_match8_match_nolit_encodeBlockAsm16K: CMPL DI, $0x08 JB matchlen_match4_match_nolit_encodeBlockAsm16K MOVQ (R9)(R11*1), R10 XORQ (SI)(R11*1), R10 JNZ 
matchlen_bsf_8_match_nolit_encodeBlockAsm16K LEAL -8(DI), DI LEAL 8(R11), R11 JMP matchlen_match4_match_nolit_encodeBlockAsm16K matchlen_bsf_8_match_nolit_encodeBlockAsm16K: #ifdef GOAMD64_v3 TZCNTQ R10, R10 #else BSFQ R10, R10 #endif SARQ $0x03, R10 LEAL (R11)(R10*1), R11 JMP match_nolit_end_encodeBlockAsm16K matchlen_match4_match_nolit_encodeBlockAsm16K: CMPL DI, $0x04 JB matchlen_match2_match_nolit_encodeBlockAsm16K MOVL (R9)(R11*1), R10 CMPL (SI)(R11*1), R10 JNE matchlen_match2_match_nolit_encodeBlockAsm16K LEAL -4(DI), DI LEAL 4(R11), R11 matchlen_match2_match_nolit_encodeBlockAsm16K: CMPL DI, $0x01 JE matchlen_match1_match_nolit_encodeBlockAsm16K JB match_nolit_end_encodeBlockAsm16K MOVW (R9)(R11*1), R10 CMPW (SI)(R11*1), R10 JNE matchlen_match1_match_nolit_encodeBlockAsm16K LEAL 2(R11), R11 SUBL $0x02, DI JZ match_nolit_end_encodeBlockAsm16K matchlen_match1_match_nolit_encodeBlockAsm16K: MOVB (R9)(R11*1), R10 CMPB (SI)(R11*1), R10 JNE match_nolit_end_encodeBlockAsm16K LEAL 1(R11), R11 match_nolit_end_encodeBlockAsm16K: ADDL R11, DX ADDL $0x04, R11 MOVL 16(SP), SI MOVL 12(SP), DI MOVL DX, 12(SP) SUBL DI, R8 JZ match_nolits_copy_encodeBlockAsm16K LEAQ (BX)(DI*1), DI CMPL R8, $0x03 JA match_emit_lits_copy_encodeBlockAsm16K CMPL SI, $0x40 JB match_emit_lits_copy_encodeBlockAsm16K MOVL (DI), DI // emitCopy2WithLits XORQ R9, R9 SUBL $0x40, SI LEAL -11(R11), R10 LEAL -4(R11), R11 MOVW SI, 1(CX) CMPL R11, $0x07 CMOVLGE R10, R9 MOVQ $0x00000007, SI CMOVLLT R11, SI LEAL -1(R8)(SI*4), SI MOVL $0x00000003, R10 LEAL (R10)(SI*8), SI MOVB SI, (CX) ADDQ $0x03, CX MOVL DI, (CX) ADDQ R8, CX TESTL R9, R9 JZ match_nolit_emitcopy_end_encodeBlockAsm16K // emitRepeat LEAL -1(R9), SI CMPL R9, $0x1d JBE repeat_one_match_emit_repeat_copy2_encodeBlockAsm16K LEAL -30(R9), SI CMPL R9, $0x0000011e JB repeat_two_match_emit_repeat_copy2_encodeBlockAsm16K CMPL R9, $0x0001001e JB repeat_three_match_emit_repeat_copy2_encodeBlockAsm16K MOVB $0xfc, (CX) MOVL SI, 1(CX) ADDQ $0x04, CX JMP 
match_nolit_emitcopy_end_encodeBlockAsm16K repeat_three_match_emit_repeat_copy2_encodeBlockAsm16K: MOVB $0xf4, (CX) MOVW SI, 1(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBlockAsm16K repeat_two_match_emit_repeat_copy2_encodeBlockAsm16K: MOVB $0xec, (CX) MOVB SI, 1(CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBlockAsm16K repeat_one_match_emit_repeat_copy2_encodeBlockAsm16K: XORL SI, SI LEAL -4(SI)(R9*8), SI MOVB SI, (CX) ADDQ $0x01, CX JMP match_nolit_emitcopy_end_encodeBlockAsm16K match_emit_lits_copy_encodeBlockAsm16K: LEAQ 3(CX)(R8*1), R9 CMPQ R9, (SP) JB dst_size_check_ok_3 MOVQ $0x00000000, ret+56(FP) RET dst_size_check_ok_3: // emitLiteral LEAL -1(R8), R9 CMPL R9, $0x1d JB one_byte_match_emit_encodeBlockAsm16K SUBL $0x1d, R9 CMPL R9, $0x00000100 JB two_bytes_match_emit_encodeBlockAsm16K JB three_bytes_match_emit_encodeBlockAsm16K three_bytes_match_emit_encodeBlockAsm16K: MOVB $0xf0, (CX) MOVW R9, 1(CX) ADDQ $0x03, CX ADDL $0x1d, R9 JMP memmove_long_match_emit_encodeBlockAsm16K two_bytes_match_emit_encodeBlockAsm16K: MOVB $0xe8, (CX) MOVB R9, 1(CX) ADDL $0x1d, R9 ADDQ $0x02, CX CMPL R9, $0x40 JB memmove_midmatch_emit_encodeBlockAsm16K JMP memmove_long_match_emit_encodeBlockAsm16K one_byte_match_emit_encodeBlockAsm16K: SHLB $0x03, R9 MOVB R9, (CX) ADDQ $0x01, CX LEAQ (CX)(R8*1), R9 // genMemMoveShort // margin: 16, min move: 1 CMPQ R8, $0x10 JBE emit_lit_memmove_match_emit_encodeBlockAsm16K_memmove_move_8through16 CMPQ R8, $0x20 JBE emit_lit_memmove_match_emit_encodeBlockAsm16K_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeBlockAsm16K_memmove_move_33through64 emit_lit_memmove_match_emit_encodeBlockAsm16K_memmove_move_8through16: MOVOU (DI), X0 MOVOU X0, (CX) JMP memmove_end_copy_match_emit_encodeBlockAsm16K emit_lit_memmove_match_emit_encodeBlockAsm16K_memmove_move_17through32: MOVOU (DI), X0 MOVOU -16(DI)(R8*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(R8*1) JMP memmove_end_copy_match_emit_encodeBlockAsm16K 
emit_lit_memmove_match_emit_encodeBlockAsm16K_memmove_move_33through64: MOVOU (DI), X0 MOVOU 16(DI), X1 MOVOU -32(DI)(R8*1), X2 MOVOU -16(DI)(R8*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R8*1) MOVOU X3, -16(CX)(R8*1) memmove_end_copy_match_emit_encodeBlockAsm16K: MOVQ R9, CX JMP match_nolits_copy_encodeBlockAsm16K memmove_midmatch_emit_encodeBlockAsm16K: LEAQ (CX)(R8*1), R9 // genMemMoveShort // margin: 15, min move: 30 CMPQ R8, $0x20 JBE emit_lit_memmove_mid_match_emit_encodeBlockAsm16K_memmove_move_17through32 JMP emit_lit_memmove_mid_match_emit_encodeBlockAsm16K_memmove_move_33through64 emit_lit_memmove_mid_match_emit_encodeBlockAsm16K_memmove_move_17through32: MOVOU (DI), X0 MOVOU -16(DI)(R8*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(R8*1) JMP memmove_mid_end_copy_match_emit_encodeBlockAsm16K emit_lit_memmove_mid_match_emit_encodeBlockAsm16K_memmove_move_33through64: MOVOU (DI), X0 MOVOU 16(DI), X1 MOVOU -32(DI)(R8*1), X2 MOVOU -16(DI)(R8*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R8*1) MOVOU X3, -16(CX)(R8*1) memmove_mid_end_copy_match_emit_encodeBlockAsm16K: MOVQ R9, CX JMP match_nolits_copy_encodeBlockAsm16K memmove_long_match_emit_encodeBlockAsm16K: LEAQ (CX)(R8*1), R9 // genMemMoveLong MOVOU (DI), X0 MOVOU 16(DI), X1 MOVOU -32(DI)(R8*1), X2 MOVOU -16(DI)(R8*1), X3 MOVQ R8, R12 SHRQ $0x05, R12 MOVQ CX, R10 ANDL $0x0000001f, R10 MOVQ $0x00000040, R13 SUBQ R10, R13 DECQ R12 JA emit_lit_memmove_long_match_emit_encodeBlockAsm16Klarge_forward_sse_loop_32 LEAQ -32(DI)(R13*1), R10 LEAQ -32(CX)(R13*1), R14 emit_lit_memmove_long_match_emit_encodeBlockAsm16Klarge_big_loop_back: MOVOU (R10), X4 MOVOU 16(R10), X5 MOVOA X4, (R14) MOVOA X5, 16(R14) ADDQ $0x20, R14 ADDQ $0x20, R10 ADDQ $0x20, R13 DECQ R12 JNA emit_lit_memmove_long_match_emit_encodeBlockAsm16Klarge_big_loop_back emit_lit_memmove_long_match_emit_encodeBlockAsm16Klarge_forward_sse_loop_32: MOVOU -32(DI)(R13*1), X4 MOVOU -16(DI)(R13*1), X5 MOVOA X4, -32(CX)(R13*1) MOVOA X5, 
-16(CX)(R13*1) ADDQ $0x20, R13 CMPQ R8, R13 JAE emit_lit_memmove_long_match_emit_encodeBlockAsm16Klarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R8*1) MOVOU X3, -16(CX)(R8*1) MOVQ R9, CX match_nolits_copy_encodeBlockAsm16K: // emitCopy CMPL SI, $0x00000400 JA two_byte_match_nolit_encodeBlockAsm16K CMPL R11, $0x00000013 JAE emit_one_longer_match_nolit_encodeBlockAsm16K LEAL -1(SI), SI SHLL $0x06, SI LEAL -15(SI)(R11*4), SI MOVW SI, (CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBlockAsm16K emit_one_longer_match_nolit_encodeBlockAsm16K: CMPL R11, $0x00000112 JAE emit_copy1_repeat_match_nolit_encodeBlockAsm16K LEAL -1(SI), SI SHLL $0x06, SI LEAL 61(SI), SI MOVW SI, (CX) LEAL -18(R11), SI MOVB SI, 2(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBlockAsm16K emit_copy1_repeat_match_nolit_encodeBlockAsm16K: LEAL -1(SI), SI SHLL $0x06, SI LEAL 57(SI), SI MOVW SI, (CX) ADDQ $0x02, CX SUBL $0x12, R11 // emitRepeat LEAL -1(R11), SI CMPL R11, $0x1d JBE repeat_one_emit_copy1_do_repeat_match_nolit_encodeBlockAsm16K LEAL -30(R11), SI CMPL R11, $0x0000011e JB repeat_two_emit_copy1_do_repeat_match_nolit_encodeBlockAsm16K CMPL R11, $0x0001001e JB repeat_three_emit_copy1_do_repeat_match_nolit_encodeBlockAsm16K MOVB $0xfc, (CX) MOVL SI, 1(CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBlockAsm16K repeat_three_emit_copy1_do_repeat_match_nolit_encodeBlockAsm16K: MOVB $0xf4, (CX) MOVW SI, 1(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBlockAsm16K repeat_two_emit_copy1_do_repeat_match_nolit_encodeBlockAsm16K: MOVB $0xec, (CX) MOVB SI, 1(CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBlockAsm16K repeat_one_emit_copy1_do_repeat_match_nolit_encodeBlockAsm16K: XORL SI, SI LEAL -4(SI)(R11*8), SI MOVB SI, (CX) ADDQ $0x01, CX JMP match_nolit_emitcopy_end_encodeBlockAsm16K two_byte_match_nolit_encodeBlockAsm16K: // emitCopy2 LEAL -64(SI), SI LEAL -4(R11), R11 MOVW SI, 1(CX) CMPL R11, $0x3c JBE 
emit_copy2_0_match_nolit_encodeBlockAsm16K_emit2 LEAL -60(R11), SI CMPL R11, $0x0000013c JB emit_copy2_1_match_nolit_encodeBlockAsm16K_emit2 CMPL R11, $0x0001003c JB emit_copy2_2_match_nolit_encodeBlockAsm16K_emit2 MOVB $0xfe, (CX) MOVL SI, 3(CX) ADDQ $0x06, CX JMP match_nolit_emitcopy_end_encodeBlockAsm16K emit_copy2_2_match_nolit_encodeBlockAsm16K_emit2: MOVB $0xfa, (CX) MOVW SI, 3(CX) ADDQ $0x05, CX JMP match_nolit_emitcopy_end_encodeBlockAsm16K emit_copy2_1_match_nolit_encodeBlockAsm16K_emit2: MOVB $0xf6, (CX) MOVB SI, 3(CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBlockAsm16K emit_copy2_0_match_nolit_encodeBlockAsm16K_emit2: MOVL $0x00000002, SI LEAL (SI)(R11*4), SI MOVB SI, (CX) ADDQ $0x03, CX match_nolit_emitcopy_end_encodeBlockAsm16K: CMPL DX, 8(SP) JAE emit_remainder_encodeBlockAsm16K MOVQ -2(BX)(DX*1), DI CMPQ CX, (SP) JB match_nolit_dst_ok_encodeBlockAsm16K MOVQ $0x00000000, ret+56(FP) RET match_nolit_dst_ok_encodeBlockAsm16K: MOVQ $0x000000cf1bbcdcbb, SI MOVQ DI, R8 SHRQ $0x10, DI MOVQ DI, R9 SHLQ $0x18, R8 IMULQ SI, R8 SHRQ $0x34, R8 SHLQ $0x18, R9 IMULQ SI, R9 SHRQ $0x34, R9 LEAL -2(DX), R10 MOVWLZX (AX)(R9*2), SI MOVW R10, (AX)(R8*2) MOVW DX, (AX)(R9*2) MOVL DX, R8 INCL DX CMPL (BX)(SI*1), DI JNE search_loop_encodeBlockAsm16K MOVL R8, DI SUBL SI, DI MOVL DI, 16(SP) CMPQ CX, (SP) JB dst_size_check_ok_4 MOVQ $0x00000000, ret+56(FP) RET dst_size_check_ok_4: ADDL $0x03, DX ADDL $0x04, SI MOVQ src_len+32(FP), DI SUBL DX, DI LEAQ (BX)(DX*1), R8 LEAQ (BX)(SI*1), SI // matchLen XORL R11, R11 JMP matchlen_loop_16_entry_match_nolit2_encodeBlockAsm16K matchlen_loopback_16_match_nolit2_encodeBlockAsm16K: MOVQ (R8)(R11*1), R9 MOVQ 8(R8)(R11*1), R10 XORQ (SI)(R11*1), R9 JNZ matchlen_bsf_8_match_nolit2_encodeBlockAsm16K XORQ 8(SI)(R11*1), R10 JNZ matchlen_bsf_16match_nolit2_encodeBlockAsm16K LEAL -16(DI), DI LEAL 16(R11), R11 matchlen_loop_16_entry_match_nolit2_encodeBlockAsm16K: CMPL DI, $0x10 JAE matchlen_loopback_16_match_nolit2_encodeBlockAsm16K JMP 
matchlen_match8_match_nolit2_encodeBlockAsm16K matchlen_bsf_16match_nolit2_encodeBlockAsm16K: #ifdef GOAMD64_v3 TZCNTQ R10, R10 #else BSFQ R10, R10 #endif SARQ $0x03, R10 LEAL 8(R11)(R10*1), R11 JMP match_nolit2_end_encodeBlockAsm16K matchlen_match8_match_nolit2_encodeBlockAsm16K: CMPL DI, $0x08 JB matchlen_match4_match_nolit2_encodeBlockAsm16K MOVQ (R8)(R11*1), R9 XORQ (SI)(R11*1), R9 JNZ matchlen_bsf_8_match_nolit2_encodeBlockAsm16K LEAL -8(DI), DI LEAL 8(R11), R11 JMP matchlen_match4_match_nolit2_encodeBlockAsm16K matchlen_bsf_8_match_nolit2_encodeBlockAsm16K: #ifdef GOAMD64_v3 TZCNTQ R9, R9 #else BSFQ R9, R9 #endif SARQ $0x03, R9 LEAL (R11)(R9*1), R11 JMP match_nolit2_end_encodeBlockAsm16K matchlen_match4_match_nolit2_encodeBlockAsm16K: CMPL DI, $0x04 JB matchlen_match2_match_nolit2_encodeBlockAsm16K MOVL (R8)(R11*1), R9 CMPL (SI)(R11*1), R9 JNE matchlen_match2_match_nolit2_encodeBlockAsm16K LEAL -4(DI), DI LEAL 4(R11), R11 matchlen_match2_match_nolit2_encodeBlockAsm16K: CMPL DI, $0x01 JE matchlen_match1_match_nolit2_encodeBlockAsm16K JB match_nolit2_end_encodeBlockAsm16K MOVW (R8)(R11*1), R9 CMPW (SI)(R11*1), R9 JNE matchlen_match1_match_nolit2_encodeBlockAsm16K LEAL 2(R11), R11 SUBL $0x02, DI JZ match_nolit2_end_encodeBlockAsm16K matchlen_match1_match_nolit2_encodeBlockAsm16K: MOVB (R8)(R11*1), R9 CMPB (SI)(R11*1), R9 JNE match_nolit2_end_encodeBlockAsm16K LEAL 1(R11), R11 match_nolit2_end_encodeBlockAsm16K: ADDL R11, DX ADDL $0x04, R11 MOVL DX, 12(SP) MOVL 16(SP), SI JMP match_nolits_copy_encodeBlockAsm16K emit_remainder_encodeBlockAsm16K: MOVQ src_len+32(FP), AX MOVL 12(SP), DX SUBL DX, AX JZ emit_remainder_end_encodeBlockAsm16K LEAQ (BX)(DX*1), DX LEAQ 3(CX)(AX*1), BX CMPQ BX, (SP) JB dst_size_check_ok_5 MOVQ $0x00000000, ret+56(FP) RET dst_size_check_ok_5: // emitLiteral LEAL -1(AX), BX CMPL BX, $0x1d JB one_byte_emit_remainder_encodeBlockAsm16K SUBL $0x1d, BX CMPL BX, $0x00000100 JB two_bytes_emit_remainder_encodeBlockAsm16K JB 
three_bytes_emit_remainder_encodeBlockAsm16K three_bytes_emit_remainder_encodeBlockAsm16K: MOVB $0xf0, (CX) MOVW BX, 1(CX) ADDQ $0x03, CX ADDL $0x1d, BX JMP memmove_long_emit_remainder_encodeBlockAsm16K two_bytes_emit_remainder_encodeBlockAsm16K: MOVB $0xe8, (CX) MOVB BL, 1(CX) ADDL $0x1d, BX ADDQ $0x02, CX CMPL BX, $0x40 JB memmove_midemit_remainder_encodeBlockAsm16K JMP memmove_long_emit_remainder_encodeBlockAsm16K one_byte_emit_remainder_encodeBlockAsm16K: SHLB $0x03, BL MOVB BL, (CX) ADDQ $0x01, CX LEAQ (CX)(AX*1), BX // genMemMoveShort // margin: 0, min move: 1 CMPQ AX, $0x03 JB emit_lit_memmove_emit_remainder_encodeBlockAsm16K_memmove_move_1or2 JE emit_lit_memmove_emit_remainder_encodeBlockAsm16K_memmove_move_3 CMPQ AX, $0x08 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm16K_memmove_move_4through8 CMPQ AX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm16K_memmove_move_8through16 CMPQ AX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm16K_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeBlockAsm16K_memmove_move_33through64 emit_lit_memmove_emit_remainder_encodeBlockAsm16K_memmove_move_1or2: MOVB (DX), SI MOVB -1(DX)(AX*1), DL MOVB SI, (CX) MOVB DL, -1(CX)(AX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm16K emit_lit_memmove_emit_remainder_encodeBlockAsm16K_memmove_move_3: MOVW (DX), SI MOVB 2(DX), DL MOVW SI, (CX) MOVB DL, 2(CX) JMP memmove_end_copy_emit_remainder_encodeBlockAsm16K emit_lit_memmove_emit_remainder_encodeBlockAsm16K_memmove_move_4through8: MOVL (DX), SI MOVL -4(DX)(AX*1), DX MOVL SI, (CX) MOVL DX, -4(CX)(AX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm16K emit_lit_memmove_emit_remainder_encodeBlockAsm16K_memmove_move_8through16: MOVQ (DX), SI MOVQ -8(DX)(AX*1), DX MOVQ SI, (CX) MOVQ DX, -8(CX)(AX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm16K emit_lit_memmove_emit_remainder_encodeBlockAsm16K_memmove_move_17through32: MOVOU (DX), X0 MOVOU -16(DX)(AX*1), X1 MOVOU X0, (CX) MOVOU X1, 
-16(CX)(AX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm16K emit_lit_memmove_emit_remainder_encodeBlockAsm16K_memmove_move_33through64: MOVOU (DX), X0 MOVOU 16(DX), X1 MOVOU -32(DX)(AX*1), X2 MOVOU -16(DX)(AX*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(AX*1) MOVOU X3, -16(CX)(AX*1) memmove_end_copy_emit_remainder_encodeBlockAsm16K: MOVQ BX, CX JMP emit_remainder_end_encodeBlockAsm16K memmove_midemit_remainder_encodeBlockAsm16K: LEAQ (CX)(AX*1), BX // genMemMoveShort // margin: 0, min move: 30 CMPQ AX, $0x20 JBE emit_lit_memmove_mid_emit_remainder_encodeBlockAsm16K_memmove_move_17through32 JMP emit_lit_memmove_mid_emit_remainder_encodeBlockAsm16K_memmove_move_33through64 emit_lit_memmove_mid_emit_remainder_encodeBlockAsm16K_memmove_move_17through32: MOVOU (DX), X0 MOVOU -16(DX)(AX*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(AX*1) JMP memmove_mid_end_copy_emit_remainder_encodeBlockAsm16K emit_lit_memmove_mid_emit_remainder_encodeBlockAsm16K_memmove_move_33through64: MOVOU (DX), X0 MOVOU 16(DX), X1 MOVOU -32(DX)(AX*1), X2 MOVOU -16(DX)(AX*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(AX*1) MOVOU X3, -16(CX)(AX*1) memmove_mid_end_copy_emit_remainder_encodeBlockAsm16K: MOVQ BX, CX JMP emit_remainder_end_encodeBlockAsm16K memmove_long_emit_remainder_encodeBlockAsm16K: LEAQ (CX)(AX*1), BX // genMemMoveLong MOVOU (DX), X0 MOVOU 16(DX), X1 MOVOU -32(DX)(AX*1), X2 MOVOU -16(DX)(AX*1), X3 MOVQ AX, DI SHRQ $0x05, DI MOVQ CX, SI ANDL $0x0000001f, SI MOVQ $0x00000040, R8 SUBQ SI, R8 DECQ DI JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm16Klarge_forward_sse_loop_32 LEAQ -32(DX)(R8*1), SI LEAQ -32(CX)(R8*1), R9 emit_lit_memmove_long_emit_remainder_encodeBlockAsm16Klarge_big_loop_back: MOVOU (SI), X4 MOVOU 16(SI), X5 MOVOA X4, (R9) MOVOA X5, 16(R9) ADDQ $0x20, R9 ADDQ $0x20, SI ADDQ $0x20, R8 DECQ DI JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm16Klarge_big_loop_back 
emit_lit_memmove_long_emit_remainder_encodeBlockAsm16Klarge_forward_sse_loop_32: MOVOU -32(DX)(R8*1), X4 MOVOU -16(DX)(R8*1), X5 MOVOA X4, -32(CX)(R8*1) MOVOA X5, -16(CX)(R8*1) ADDQ $0x20, R8 CMPQ AX, R8 JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm16Klarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(AX*1) MOVOU X3, -16(CX)(AX*1) MOVQ BX, CX emit_remainder_end_encodeBlockAsm16K: MOVQ dst_base+0(FP), AX SUBQ AX, CX MOVQ CX, ret+56(FP) RET // func encodeBlockAsm4K(dst []byte, src []byte, tmp *[2048]byte) int // Requires: BMI, CMOV, SSE2 TEXT ·encodeBlockAsm4K(SB), $24-64 MOVQ tmp+48(FP), AX MOVQ dst_base+0(FP), CX MOVQ $0x00000010, DX MOVQ AX, BX PXOR X0, X0 zero_loop_encodeBlockAsm4K: MOVOU X0, (BX) MOVOU X0, 16(BX) MOVOU X0, 32(BX) MOVOU X0, 48(BX) MOVOU X0, 64(BX) MOVOU X0, 80(BX) MOVOU X0, 96(BX) MOVOU X0, 112(BX) ADDQ $0x80, BX DECQ DX JNZ zero_loop_encodeBlockAsm4K MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), DX LEAQ -17(DX), BX LEAQ -17(DX), SI MOVL SI, 8(SP) SHRQ $0x05, DX SUBL DX, BX LEAQ (CX)(BX*1), BX MOVQ BX, (SP) MOVL $0x00000001, DX MOVL DX, 16(SP) MOVQ src_base+24(FP), BX search_loop_encodeBlockAsm4K: MOVL DX, SI SUBL 12(SP), SI SHRL $0x05, SI LEAL 4(DX)(SI*1), SI CMPL SI, 8(SP) JAE emit_remainder_encodeBlockAsm4K MOVQ (BX)(DX*1), DI MOVL SI, 20(SP) MOVQ $0x9e3779b1, R9 MOVQ DI, R10 MOVQ DI, R11 SHRQ $0x08, R11 SHLQ $0x20, R10 IMULQ R9, R10 SHRQ $0x36, R10 SHLQ $0x20, R11 IMULQ R9, R11 SHRQ $0x36, R11 MOVWLZX (AX)(R10*2), SI MOVWLZX (AX)(R11*2), R8 MOVW DX, (AX)(R10*2) MOVW DX, (AX)(R11*2) MOVQ DI, R10 SHRQ $0x10, R10 SHLQ $0x20, R10 IMULQ R9, R10 SHRQ $0x36, R10 MOVL DX, R9 SUBL 16(SP), R9 MOVL 1(BX)(R9*1), R11 MOVQ DI, R9 SHRQ $0x08, R9 CMPL R9, R11 JNE no_repeat_found_encodeBlockAsm4K LEAL 1(DX), DI MOVL 12(SP), SI MOVL DI, R8 SUBL 16(SP), R8 JZ repeat_extend_back_end_encodeBlockAsm4K repeat_extend_back_loop_encodeBlockAsm4K: CMPL DI, SI JBE repeat_extend_back_end_encodeBlockAsm4K MOVB -1(BX)(R8*1), R9 MOVB 
-1(BX)(DI*1), R10 CMPB R9, R10 JNE repeat_extend_back_end_encodeBlockAsm4K LEAL -1(DI), DI DECL R8 JNZ repeat_extend_back_loop_encodeBlockAsm4K repeat_extend_back_end_encodeBlockAsm4K: MOVL DI, SI MOVL 12(SP), R8 SUBL R8, SI LEAQ 3(CX)(SI*1), R9 CMPQ R9, (SP) JB dst_size_check_ok_1 MOVQ $0x00000000, ret+56(FP) RET dst_size_check_ok_1: LEAQ (BX)(R8*1), R8 // emitLiteral LEAL -1(SI), R9 CMPL R9, $0x1d JB one_byte_repeat_emit_lits_encodeBlockAsm4K SUBL $0x1d, R9 CMPL R9, $0x00000100 JB two_bytes_repeat_emit_lits_encodeBlockAsm4K JB three_bytes_repeat_emit_lits_encodeBlockAsm4K three_bytes_repeat_emit_lits_encodeBlockAsm4K: MOVB $0xf0, (CX) MOVW R9, 1(CX) ADDQ $0x03, CX ADDL $0x1d, R9 JMP memmove_long_repeat_emit_lits_encodeBlockAsm4K two_bytes_repeat_emit_lits_encodeBlockAsm4K: MOVB $0xe8, (CX) MOVB R9, 1(CX) ADDL $0x1d, R9 ADDQ $0x02, CX CMPL R9, $0x40 JB memmove_midrepeat_emit_lits_encodeBlockAsm4K JMP memmove_long_repeat_emit_lits_encodeBlockAsm4K one_byte_repeat_emit_lits_encodeBlockAsm4K: SHLB $0x03, R9 MOVB R9, (CX) ADDQ $0x01, CX LEAQ (CX)(SI*1), R9 // genMemMoveShort // margin: 16, min move: 1 CMPQ SI, $0x10 JBE emit_lit_memmove_repeat_emit_lits_encodeBlockAsm4K_memmove_move_8through16 CMPQ SI, $0x20 JBE emit_lit_memmove_repeat_emit_lits_encodeBlockAsm4K_memmove_move_17through32 JMP emit_lit_memmove_repeat_emit_lits_encodeBlockAsm4K_memmove_move_33through64 emit_lit_memmove_repeat_emit_lits_encodeBlockAsm4K_memmove_move_8through16: MOVOU (R8), X0 MOVOU X0, (CX) JMP memmove_end_copy_repeat_emit_lits_encodeBlockAsm4K emit_lit_memmove_repeat_emit_lits_encodeBlockAsm4K_memmove_move_17through32: MOVOU (R8), X0 MOVOU -16(R8)(SI*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(SI*1) JMP memmove_end_copy_repeat_emit_lits_encodeBlockAsm4K emit_lit_memmove_repeat_emit_lits_encodeBlockAsm4K_memmove_move_33through64: MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(SI*1), X2 MOVOU -16(R8)(SI*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(SI*1) MOVOU X3, -16(CX)(SI*1) 
memmove_end_copy_repeat_emit_lits_encodeBlockAsm4K: MOVQ R9, CX JMP repeat_emit_lits_end_encodeBlockAsm4K memmove_midrepeat_emit_lits_encodeBlockAsm4K: LEAQ (CX)(SI*1), R9 // genMemMoveShort // margin: 15, min move: 30 CMPQ SI, $0x20 JBE emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm4K_memmove_move_17through32 JMP emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm4K_memmove_move_33through64 emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm4K_memmove_move_17through32: MOVOU (R8), X0 MOVOU -16(R8)(SI*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(SI*1) JMP memmove_mid_end_copy_repeat_emit_lits_encodeBlockAsm4K emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm4K_memmove_move_33through64: MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(SI*1), X2 MOVOU -16(R8)(SI*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(SI*1) MOVOU X3, -16(CX)(SI*1) memmove_mid_end_copy_repeat_emit_lits_encodeBlockAsm4K: MOVQ R9, CX JMP repeat_emit_lits_end_encodeBlockAsm4K memmove_long_repeat_emit_lits_encodeBlockAsm4K: LEAQ (CX)(SI*1), R9 // genMemMoveLong MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(SI*1), X2 MOVOU -16(R8)(SI*1), X3 MOVQ SI, R11 SHRQ $0x05, R11 MOVQ CX, R10 ANDL $0x0000001f, R10 MOVQ $0x00000040, R12 SUBQ R10, R12 DECQ R11 JA emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm4Klarge_forward_sse_loop_32 LEAQ -32(R8)(R12*1), R10 LEAQ -32(CX)(R12*1), R13 emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm4Klarge_big_loop_back: MOVOU (R10), X4 MOVOU 16(R10), X5 MOVOA X4, (R13) MOVOA X5, 16(R13) ADDQ $0x20, R13 ADDQ $0x20, R10 ADDQ $0x20, R12 DECQ R11 JNA emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm4Klarge_big_loop_back emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm4Klarge_forward_sse_loop_32: MOVOU -32(R8)(R12*1), X4 MOVOU -16(R8)(R12*1), X5 MOVOA X4, -32(CX)(R12*1) MOVOA X5, -16(CX)(R12*1) ADDQ $0x20, R12 CMPQ SI, R12 JAE emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm4Klarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, 
-32(CX)(SI*1) MOVOU X3, -16(CX)(SI*1) MOVQ R9, CX repeat_emit_lits_end_encodeBlockAsm4K: ADDL $0x05, DX MOVL DX, SI SUBL 16(SP), SI MOVQ src_len+32(FP), R8 SUBL DX, R8 LEAQ (BX)(DX*1), R9 LEAQ (BX)(SI*1), SI // matchLen XORL R11, R11 JMP matchlen_loop_16_entry_repeat_extend_encodeBlockAsm4K matchlen_loopback_16_repeat_extend_encodeBlockAsm4K: MOVQ (R9)(R11*1), R10 MOVQ 8(R9)(R11*1), R12 XORQ (SI)(R11*1), R10 JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm4K XORQ 8(SI)(R11*1), R12 JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm4K LEAL -16(R8), R8 LEAL 16(R11), R11 matchlen_loop_16_entry_repeat_extend_encodeBlockAsm4K: CMPL R8, $0x10 JAE matchlen_loopback_16_repeat_extend_encodeBlockAsm4K JMP matchlen_match8_repeat_extend_encodeBlockAsm4K matchlen_bsf_16repeat_extend_encodeBlockAsm4K: #ifdef GOAMD64_v3 TZCNTQ R12, R12 #else BSFQ R12, R12 #endif SARQ $0x03, R12 LEAL 8(R11)(R12*1), R11 JMP repeat_extend_forward_end_encodeBlockAsm4K matchlen_match8_repeat_extend_encodeBlockAsm4K: CMPL R8, $0x08 JB matchlen_match4_repeat_extend_encodeBlockAsm4K MOVQ (R9)(R11*1), R10 XORQ (SI)(R11*1), R10 JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm4K LEAL -8(R8), R8 LEAL 8(R11), R11 JMP matchlen_match4_repeat_extend_encodeBlockAsm4K matchlen_bsf_8_repeat_extend_encodeBlockAsm4K: #ifdef GOAMD64_v3 TZCNTQ R10, R10 #else BSFQ R10, R10 #endif SARQ $0x03, R10 LEAL (R11)(R10*1), R11 JMP repeat_extend_forward_end_encodeBlockAsm4K matchlen_match4_repeat_extend_encodeBlockAsm4K: CMPL R8, $0x04 JB matchlen_match2_repeat_extend_encodeBlockAsm4K MOVL (R9)(R11*1), R10 CMPL (SI)(R11*1), R10 JNE matchlen_match2_repeat_extend_encodeBlockAsm4K LEAL -4(R8), R8 LEAL 4(R11), R11 matchlen_match2_repeat_extend_encodeBlockAsm4K: CMPL R8, $0x01 JE matchlen_match1_repeat_extend_encodeBlockAsm4K JB repeat_extend_forward_end_encodeBlockAsm4K MOVW (R9)(R11*1), R10 CMPW (SI)(R11*1), R10 JNE matchlen_match1_repeat_extend_encodeBlockAsm4K LEAL 2(R11), R11 SUBL $0x02, R8 JZ repeat_extend_forward_end_encodeBlockAsm4K 
matchlen_match1_repeat_extend_encodeBlockAsm4K: MOVB (R9)(R11*1), R10 CMPB (SI)(R11*1), R10 JNE repeat_extend_forward_end_encodeBlockAsm4K LEAL 1(R11), R11 repeat_extend_forward_end_encodeBlockAsm4K: ADDL R11, DX MOVL DX, SI SUBL DI, SI MOVL 16(SP), DI // emitRepeat LEAL -1(SI), DI CMPL SI, $0x1d JBE repeat_one_match_repeat_encodeBlockAsm4K LEAL -30(SI), DI CMPL SI, $0x0000011e JB repeat_two_match_repeat_encodeBlockAsm4K CMPL SI, $0x0001001e JB repeat_three_match_repeat_encodeBlockAsm4K MOVB $0xfc, (CX) MOVL DI, 1(CX) ADDQ $0x04, CX JMP repeat_end_emit_encodeBlockAsm4K repeat_three_match_repeat_encodeBlockAsm4K: MOVB $0xf4, (CX) MOVW DI, 1(CX) ADDQ $0x03, CX JMP repeat_end_emit_encodeBlockAsm4K repeat_two_match_repeat_encodeBlockAsm4K: MOVB $0xec, (CX) MOVB DI, 1(CX) ADDQ $0x02, CX JMP repeat_end_emit_encodeBlockAsm4K repeat_one_match_repeat_encodeBlockAsm4K: XORL DI, DI LEAL -4(DI)(SI*8), DI MOVB DI, (CX) ADDQ $0x01, CX repeat_end_emit_encodeBlockAsm4K: MOVL DX, 12(SP) JMP search_loop_encodeBlockAsm4K no_repeat_found_encodeBlockAsm4K: CMPL (BX)(SI*1), DI JEQ candidate_match_encodeBlockAsm4K SHRQ $0x08, DI MOVWLZX (AX)(R10*2), SI LEAL 2(DX), R9 CMPL (BX)(R8*1), DI JEQ candidate2_match_encodeBlockAsm4K MOVW R9, (AX)(R10*2) SHRQ $0x08, DI CMPL (BX)(SI*1), DI JEQ candidate3_match_encodeBlockAsm4K MOVL 20(SP), DX JMP search_loop_encodeBlockAsm4K candidate3_match_encodeBlockAsm4K: ADDL $0x02, DX JMP candidate_match_encodeBlockAsm4K candidate2_match_encodeBlockAsm4K: MOVW R9, (AX)(R10*2) INCL DX MOVL R8, SI candidate_match_encodeBlockAsm4K: MOVL 12(SP), DI TESTL SI, SI JZ match_extend_back_end_encodeBlockAsm4K match_extend_back_loop_encodeBlockAsm4K: CMPL DX, DI JBE match_extend_back_end_encodeBlockAsm4K MOVB -1(BX)(SI*1), R8 MOVB -1(BX)(DX*1), R9 CMPB R8, R9 JNE match_extend_back_end_encodeBlockAsm4K LEAL -1(DX), DX DECL SI JZ match_extend_back_end_encodeBlockAsm4K JMP match_extend_back_loop_encodeBlockAsm4K match_extend_back_end_encodeBlockAsm4K: CMPQ CX, (SP) JB 
dst_size_check_ok_2 MOVQ $0x00000000, ret+56(FP) RET dst_size_check_ok_2: MOVL DX, R8 MOVL DX, DI SUBL SI, DI MOVL DI, 16(SP) ADDL $0x04, DX ADDL $0x04, SI MOVQ src_len+32(FP), DI SUBL DX, DI LEAQ (BX)(DX*1), R9 LEAQ (BX)(SI*1), SI // matchLen XORL R11, R11 JMP matchlen_loop_16_entry_match_nolit_encodeBlockAsm4K matchlen_loopback_16_match_nolit_encodeBlockAsm4K: MOVQ (R9)(R11*1), R10 MOVQ 8(R9)(R11*1), R12 XORQ (SI)(R11*1), R10 JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm4K XORQ 8(SI)(R11*1), R12 JNZ matchlen_bsf_16match_nolit_encodeBlockAsm4K LEAL -16(DI), DI LEAL 16(R11), R11 matchlen_loop_16_entry_match_nolit_encodeBlockAsm4K: CMPL DI, $0x10 JAE matchlen_loopback_16_match_nolit_encodeBlockAsm4K JMP matchlen_match8_match_nolit_encodeBlockAsm4K matchlen_bsf_16match_nolit_encodeBlockAsm4K: #ifdef GOAMD64_v3 TZCNTQ R12, R12 #else BSFQ R12, R12 #endif SARQ $0x03, R12 LEAL 8(R11)(R12*1), R11 JMP match_nolit_end_encodeBlockAsm4K matchlen_match8_match_nolit_encodeBlockAsm4K: CMPL DI, $0x08 JB matchlen_match4_match_nolit_encodeBlockAsm4K MOVQ (R9)(R11*1), R10 XORQ (SI)(R11*1), R10 JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm4K LEAL -8(DI), DI LEAL 8(R11), R11 JMP matchlen_match4_match_nolit_encodeBlockAsm4K matchlen_bsf_8_match_nolit_encodeBlockAsm4K: #ifdef GOAMD64_v3 TZCNTQ R10, R10 #else BSFQ R10, R10 #endif SARQ $0x03, R10 LEAL (R11)(R10*1), R11 JMP match_nolit_end_encodeBlockAsm4K matchlen_match4_match_nolit_encodeBlockAsm4K: CMPL DI, $0x04 JB matchlen_match2_match_nolit_encodeBlockAsm4K MOVL (R9)(R11*1), R10 CMPL (SI)(R11*1), R10 JNE matchlen_match2_match_nolit_encodeBlockAsm4K LEAL -4(DI), DI LEAL 4(R11), R11 matchlen_match2_match_nolit_encodeBlockAsm4K: CMPL DI, $0x01 JE matchlen_match1_match_nolit_encodeBlockAsm4K JB match_nolit_end_encodeBlockAsm4K MOVW (R9)(R11*1), R10 CMPW (SI)(R11*1), R10 JNE matchlen_match1_match_nolit_encodeBlockAsm4K LEAL 2(R11), R11 SUBL $0x02, DI JZ match_nolit_end_encodeBlockAsm4K matchlen_match1_match_nolit_encodeBlockAsm4K: MOVB 
(R9)(R11*1), R10 CMPB (SI)(R11*1), R10 JNE match_nolit_end_encodeBlockAsm4K LEAL 1(R11), R11 match_nolit_end_encodeBlockAsm4K: ADDL R11, DX ADDL $0x04, R11 MOVL 16(SP), SI MOVL 12(SP), DI MOVL DX, 12(SP) SUBL DI, R8 JZ match_nolits_copy_encodeBlockAsm4K LEAQ (BX)(DI*1), DI CMPL R8, $0x03 JA match_emit_lits_copy_encodeBlockAsm4K CMPL SI, $0x40 JB match_emit_lits_copy_encodeBlockAsm4K MOVL (DI), DI // emitCopy2WithLits XORQ R9, R9 SUBL $0x40, SI LEAL -11(R11), R10 LEAL -4(R11), R11 MOVW SI, 1(CX) CMPL R11, $0x07 CMOVLGE R10, R9 MOVQ $0x00000007, SI CMOVLLT R11, SI LEAL -1(R8)(SI*4), SI MOVL $0x00000003, R10 LEAL (R10)(SI*8), SI MOVB SI, (CX) ADDQ $0x03, CX MOVL DI, (CX) ADDQ R8, CX TESTL R9, R9 JZ match_nolit_emitcopy_end_encodeBlockAsm4K // emitRepeat LEAL -1(R9), SI CMPL R9, $0x1d JBE repeat_one_match_emit_repeat_copy2_encodeBlockAsm4K LEAL -30(R9), SI CMPL R9, $0x0000011e JB repeat_two_match_emit_repeat_copy2_encodeBlockAsm4K CMPL R9, $0x0001001e JB repeat_three_match_emit_repeat_copy2_encodeBlockAsm4K MOVB $0xfc, (CX) MOVL SI, 1(CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBlockAsm4K repeat_three_match_emit_repeat_copy2_encodeBlockAsm4K: MOVB $0xf4, (CX) MOVW SI, 1(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBlockAsm4K repeat_two_match_emit_repeat_copy2_encodeBlockAsm4K: MOVB $0xec, (CX) MOVB SI, 1(CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBlockAsm4K repeat_one_match_emit_repeat_copy2_encodeBlockAsm4K: XORL SI, SI LEAL -4(SI)(R9*8), SI MOVB SI, (CX) ADDQ $0x01, CX JMP match_nolit_emitcopy_end_encodeBlockAsm4K match_emit_lits_copy_encodeBlockAsm4K: LEAQ 3(CX)(R8*1), R9 CMPQ R9, (SP) JB dst_size_check_ok_3 MOVQ $0x00000000, ret+56(FP) RET dst_size_check_ok_3: // emitLiteral LEAL -1(R8), R9 CMPL R9, $0x1d JB one_byte_match_emit_encodeBlockAsm4K SUBL $0x1d, R9 CMPL R9, $0x00000100 JB two_bytes_match_emit_encodeBlockAsm4K JB three_bytes_match_emit_encodeBlockAsm4K three_bytes_match_emit_encodeBlockAsm4K: MOVB $0xf0, (CX) MOVW R9, 1(CX) 
ADDQ $0x03, CX ADDL $0x1d, R9 JMP memmove_long_match_emit_encodeBlockAsm4K two_bytes_match_emit_encodeBlockAsm4K: MOVB $0xe8, (CX) MOVB R9, 1(CX) ADDL $0x1d, R9 ADDQ $0x02, CX CMPL R9, $0x40 JB memmove_midmatch_emit_encodeBlockAsm4K JMP memmove_long_match_emit_encodeBlockAsm4K one_byte_match_emit_encodeBlockAsm4K: SHLB $0x03, R9 MOVB R9, (CX) ADDQ $0x01, CX LEAQ (CX)(R8*1), R9 // genMemMoveShort // margin: 16, min move: 1 CMPQ R8, $0x10 JBE emit_lit_memmove_match_emit_encodeBlockAsm4K_memmove_move_8through16 CMPQ R8, $0x20 JBE emit_lit_memmove_match_emit_encodeBlockAsm4K_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeBlockAsm4K_memmove_move_33through64 emit_lit_memmove_match_emit_encodeBlockAsm4K_memmove_move_8through16: MOVOU (DI), X0 MOVOU X0, (CX) JMP memmove_end_copy_match_emit_encodeBlockAsm4K emit_lit_memmove_match_emit_encodeBlockAsm4K_memmove_move_17through32: MOVOU (DI), X0 MOVOU -16(DI)(R8*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(R8*1) JMP memmove_end_copy_match_emit_encodeBlockAsm4K emit_lit_memmove_match_emit_encodeBlockAsm4K_memmove_move_33through64: MOVOU (DI), X0 MOVOU 16(DI), X1 MOVOU -32(DI)(R8*1), X2 MOVOU -16(DI)(R8*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R8*1) MOVOU X3, -16(CX)(R8*1) memmove_end_copy_match_emit_encodeBlockAsm4K: MOVQ R9, CX JMP match_nolits_copy_encodeBlockAsm4K memmove_midmatch_emit_encodeBlockAsm4K: LEAQ (CX)(R8*1), R9 // genMemMoveShort // margin: 15, min move: 30 CMPQ R8, $0x20 JBE emit_lit_memmove_mid_match_emit_encodeBlockAsm4K_memmove_move_17through32 JMP emit_lit_memmove_mid_match_emit_encodeBlockAsm4K_memmove_move_33through64 emit_lit_memmove_mid_match_emit_encodeBlockAsm4K_memmove_move_17through32: MOVOU (DI), X0 MOVOU -16(DI)(R8*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(R8*1) JMP memmove_mid_end_copy_match_emit_encodeBlockAsm4K emit_lit_memmove_mid_match_emit_encodeBlockAsm4K_memmove_move_33through64: MOVOU (DI), X0 MOVOU 16(DI), X1 MOVOU -32(DI)(R8*1), X2 MOVOU -16(DI)(R8*1), X3 MOVOU 
X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R8*1) MOVOU X3, -16(CX)(R8*1) memmove_mid_end_copy_match_emit_encodeBlockAsm4K: MOVQ R9, CX JMP match_nolits_copy_encodeBlockAsm4K memmove_long_match_emit_encodeBlockAsm4K: LEAQ (CX)(R8*1), R9 // genMemMoveLong MOVOU (DI), X0 MOVOU 16(DI), X1 MOVOU -32(DI)(R8*1), X2 MOVOU -16(DI)(R8*1), X3 MOVQ R8, R12 SHRQ $0x05, R12 MOVQ CX, R10 ANDL $0x0000001f, R10 MOVQ $0x00000040, R13 SUBQ R10, R13 DECQ R12 JA emit_lit_memmove_long_match_emit_encodeBlockAsm4Klarge_forward_sse_loop_32 LEAQ -32(DI)(R13*1), R10 LEAQ -32(CX)(R13*1), R14 emit_lit_memmove_long_match_emit_encodeBlockAsm4Klarge_big_loop_back: MOVOU (R10), X4 MOVOU 16(R10), X5 MOVOA X4, (R14) MOVOA X5, 16(R14) ADDQ $0x20, R14 ADDQ $0x20, R10 ADDQ $0x20, R13 DECQ R12 JNA emit_lit_memmove_long_match_emit_encodeBlockAsm4Klarge_big_loop_back emit_lit_memmove_long_match_emit_encodeBlockAsm4Klarge_forward_sse_loop_32: MOVOU -32(DI)(R13*1), X4 MOVOU -16(DI)(R13*1), X5 MOVOA X4, -32(CX)(R13*1) MOVOA X5, -16(CX)(R13*1) ADDQ $0x20, R13 CMPQ R8, R13 JAE emit_lit_memmove_long_match_emit_encodeBlockAsm4Klarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R8*1) MOVOU X3, -16(CX)(R8*1) MOVQ R9, CX match_nolits_copy_encodeBlockAsm4K: // emitCopy CMPL SI, $0x00000400 JA two_byte_match_nolit_encodeBlockAsm4K CMPL R11, $0x00000013 JAE emit_one_longer_match_nolit_encodeBlockAsm4K LEAL -1(SI), SI SHLL $0x06, SI LEAL -15(SI)(R11*4), SI MOVW SI, (CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBlockAsm4K emit_one_longer_match_nolit_encodeBlockAsm4K: CMPL R11, $0x00000112 JAE emit_copy1_repeat_match_nolit_encodeBlockAsm4K LEAL -1(SI), SI SHLL $0x06, SI LEAL 61(SI), SI MOVW SI, (CX) LEAL -18(R11), SI MOVB SI, 2(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBlockAsm4K emit_copy1_repeat_match_nolit_encodeBlockAsm4K: LEAL -1(SI), SI SHLL $0x06, SI LEAL 57(SI), SI MOVW SI, (CX) ADDQ $0x02, CX SUBL $0x12, R11 // emitRepeat LEAL -1(R11), SI CMPL R11, $0x1d JBE 
repeat_one_emit_copy1_do_repeat_match_nolit_encodeBlockAsm4K LEAL -30(R11), SI CMPL R11, $0x0000011e JB repeat_two_emit_copy1_do_repeat_match_nolit_encodeBlockAsm4K CMPL R11, $0x0001001e JB repeat_three_emit_copy1_do_repeat_match_nolit_encodeBlockAsm4K MOVB $0xfc, (CX) MOVL SI, 1(CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBlockAsm4K repeat_three_emit_copy1_do_repeat_match_nolit_encodeBlockAsm4K: MOVB $0xf4, (CX) MOVW SI, 1(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBlockAsm4K repeat_two_emit_copy1_do_repeat_match_nolit_encodeBlockAsm4K: MOVB $0xec, (CX) MOVB SI, 1(CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBlockAsm4K repeat_one_emit_copy1_do_repeat_match_nolit_encodeBlockAsm4K: XORL SI, SI LEAL -4(SI)(R11*8), SI MOVB SI, (CX) ADDQ $0x01, CX JMP match_nolit_emitcopy_end_encodeBlockAsm4K two_byte_match_nolit_encodeBlockAsm4K: // emitCopy2 LEAL -64(SI), SI LEAL -4(R11), R11 MOVW SI, 1(CX) CMPL R11, $0x3c JBE emit_copy2_0_match_nolit_encodeBlockAsm4K_emit2 LEAL -60(R11), SI CMPL R11, $0x0000013c JB emit_copy2_1_match_nolit_encodeBlockAsm4K_emit2 CMPL R11, $0x0001003c JB emit_copy2_2_match_nolit_encodeBlockAsm4K_emit2 MOVB $0xfe, (CX) MOVL SI, 3(CX) ADDQ $0x06, CX JMP match_nolit_emitcopy_end_encodeBlockAsm4K emit_copy2_2_match_nolit_encodeBlockAsm4K_emit2: MOVB $0xfa, (CX) MOVW SI, 3(CX) ADDQ $0x05, CX JMP match_nolit_emitcopy_end_encodeBlockAsm4K emit_copy2_1_match_nolit_encodeBlockAsm4K_emit2: MOVB $0xf6, (CX) MOVB SI, 3(CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBlockAsm4K emit_copy2_0_match_nolit_encodeBlockAsm4K_emit2: MOVL $0x00000002, SI LEAL (SI)(R11*4), SI MOVB SI, (CX) ADDQ $0x03, CX match_nolit_emitcopy_end_encodeBlockAsm4K: CMPL DX, 8(SP) JAE emit_remainder_encodeBlockAsm4K MOVQ -2(BX)(DX*1), DI CMPQ CX, (SP) JB match_nolit_dst_ok_encodeBlockAsm4K MOVQ $0x00000000, ret+56(FP) RET match_nolit_dst_ok_encodeBlockAsm4K: MOVQ $0x9e3779b1, SI MOVQ DI, R8 SHRQ $0x10, DI MOVQ DI, R9 SHLQ $0x20, R8 IMULQ SI, R8 SHRQ $0x36, 
R8 SHLQ $0x20, R9 IMULQ SI, R9 SHRQ $0x36, R9 LEAL -2(DX), R10 MOVWLZX (AX)(R9*2), SI MOVW R10, (AX)(R8*2) MOVW DX, (AX)(R9*2) MOVL DX, R8 INCL DX CMPL (BX)(SI*1), DI JNE search_loop_encodeBlockAsm4K MOVL R8, DI SUBL SI, DI MOVL DI, 16(SP) CMPQ CX, (SP) JB dst_size_check_ok_4 MOVQ $0x00000000, ret+56(FP) RET dst_size_check_ok_4: ADDL $0x03, DX ADDL $0x04, SI MOVQ src_len+32(FP), DI SUBL DX, DI LEAQ (BX)(DX*1), R8 LEAQ (BX)(SI*1), SI // matchLen XORL R11, R11 JMP matchlen_loop_16_entry_match_nolit2_encodeBlockAsm4K matchlen_loopback_16_match_nolit2_encodeBlockAsm4K: MOVQ (R8)(R11*1), R9 MOVQ 8(R8)(R11*1), R10 XORQ (SI)(R11*1), R9 JNZ matchlen_bsf_8_match_nolit2_encodeBlockAsm4K XORQ 8(SI)(R11*1), R10 JNZ matchlen_bsf_16match_nolit2_encodeBlockAsm4K LEAL -16(DI), DI LEAL 16(R11), R11 matchlen_loop_16_entry_match_nolit2_encodeBlockAsm4K: CMPL DI, $0x10 JAE matchlen_loopback_16_match_nolit2_encodeBlockAsm4K JMP matchlen_match8_match_nolit2_encodeBlockAsm4K matchlen_bsf_16match_nolit2_encodeBlockAsm4K: #ifdef GOAMD64_v3 TZCNTQ R10, R10 #else BSFQ R10, R10 #endif SARQ $0x03, R10 LEAL 8(R11)(R10*1), R11 JMP match_nolit2_end_encodeBlockAsm4K matchlen_match8_match_nolit2_encodeBlockAsm4K: CMPL DI, $0x08 JB matchlen_match4_match_nolit2_encodeBlockAsm4K MOVQ (R8)(R11*1), R9 XORQ (SI)(R11*1), R9 JNZ matchlen_bsf_8_match_nolit2_encodeBlockAsm4K LEAL -8(DI), DI LEAL 8(R11), R11 JMP matchlen_match4_match_nolit2_encodeBlockAsm4K matchlen_bsf_8_match_nolit2_encodeBlockAsm4K: #ifdef GOAMD64_v3 TZCNTQ R9, R9 #else BSFQ R9, R9 #endif SARQ $0x03, R9 LEAL (R11)(R9*1), R11 JMP match_nolit2_end_encodeBlockAsm4K matchlen_match4_match_nolit2_encodeBlockAsm4K: CMPL DI, $0x04 JB matchlen_match2_match_nolit2_encodeBlockAsm4K MOVL (R8)(R11*1), R9 CMPL (SI)(R11*1), R9 JNE matchlen_match2_match_nolit2_encodeBlockAsm4K LEAL -4(DI), DI LEAL 4(R11), R11 matchlen_match2_match_nolit2_encodeBlockAsm4K: CMPL DI, $0x01 JE matchlen_match1_match_nolit2_encodeBlockAsm4K JB match_nolit2_end_encodeBlockAsm4K 
MOVW (R8)(R11*1), R9 CMPW (SI)(R11*1), R9 JNE matchlen_match1_match_nolit2_encodeBlockAsm4K LEAL 2(R11), R11 SUBL $0x02, DI JZ match_nolit2_end_encodeBlockAsm4K matchlen_match1_match_nolit2_encodeBlockAsm4K: MOVB (R8)(R11*1), R9 CMPB (SI)(R11*1), R9 JNE match_nolit2_end_encodeBlockAsm4K LEAL 1(R11), R11 match_nolit2_end_encodeBlockAsm4K: ADDL R11, DX ADDL $0x04, R11 MOVL DX, 12(SP) MOVL 16(SP), SI JMP match_nolits_copy_encodeBlockAsm4K emit_remainder_encodeBlockAsm4K: MOVQ src_len+32(FP), AX MOVL 12(SP), DX SUBL DX, AX JZ emit_remainder_end_encodeBlockAsm4K LEAQ (BX)(DX*1), DX LEAQ 3(CX)(AX*1), BX CMPQ BX, (SP) JB dst_size_check_ok_5 MOVQ $0x00000000, ret+56(FP) RET dst_size_check_ok_5: // emitLiteral LEAL -1(AX), BX CMPL BX, $0x1d JB one_byte_emit_remainder_encodeBlockAsm4K SUBL $0x1d, BX CMPL BX, $0x00000100 JB two_bytes_emit_remainder_encodeBlockAsm4K JB three_bytes_emit_remainder_encodeBlockAsm4K three_bytes_emit_remainder_encodeBlockAsm4K: MOVB $0xf0, (CX) MOVW BX, 1(CX) ADDQ $0x03, CX ADDL $0x1d, BX JMP memmove_long_emit_remainder_encodeBlockAsm4K two_bytes_emit_remainder_encodeBlockAsm4K: MOVB $0xe8, (CX) MOVB BL, 1(CX) ADDL $0x1d, BX ADDQ $0x02, CX CMPL BX, $0x40 JB memmove_midemit_remainder_encodeBlockAsm4K JMP memmove_long_emit_remainder_encodeBlockAsm4K one_byte_emit_remainder_encodeBlockAsm4K: SHLB $0x03, BL MOVB BL, (CX) ADDQ $0x01, CX LEAQ (CX)(AX*1), BX // genMemMoveShort // margin: 0, min move: 1 CMPQ AX, $0x03 JB emit_lit_memmove_emit_remainder_encodeBlockAsm4K_memmove_move_1or2 JE emit_lit_memmove_emit_remainder_encodeBlockAsm4K_memmove_move_3 CMPQ AX, $0x08 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm4K_memmove_move_4through8 CMPQ AX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm4K_memmove_move_8through16 CMPQ AX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm4K_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeBlockAsm4K_memmove_move_33through64 
emit_lit_memmove_emit_remainder_encodeBlockAsm4K_memmove_move_1or2: MOVB (DX), SI MOVB -1(DX)(AX*1), DL MOVB SI, (CX) MOVB DL, -1(CX)(AX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm4K emit_lit_memmove_emit_remainder_encodeBlockAsm4K_memmove_move_3: MOVW (DX), SI MOVB 2(DX), DL MOVW SI, (CX) MOVB DL, 2(CX) JMP memmove_end_copy_emit_remainder_encodeBlockAsm4K emit_lit_memmove_emit_remainder_encodeBlockAsm4K_memmove_move_4through8: MOVL (DX), SI MOVL -4(DX)(AX*1), DX MOVL SI, (CX) MOVL DX, -4(CX)(AX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm4K emit_lit_memmove_emit_remainder_encodeBlockAsm4K_memmove_move_8through16: MOVQ (DX), SI MOVQ -8(DX)(AX*1), DX MOVQ SI, (CX) MOVQ DX, -8(CX)(AX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm4K emit_lit_memmove_emit_remainder_encodeBlockAsm4K_memmove_move_17through32: MOVOU (DX), X0 MOVOU -16(DX)(AX*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(AX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm4K emit_lit_memmove_emit_remainder_encodeBlockAsm4K_memmove_move_33through64: MOVOU (DX), X0 MOVOU 16(DX), X1 MOVOU -32(DX)(AX*1), X2 MOVOU -16(DX)(AX*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(AX*1) MOVOU X3, -16(CX)(AX*1) memmove_end_copy_emit_remainder_encodeBlockAsm4K: MOVQ BX, CX JMP emit_remainder_end_encodeBlockAsm4K memmove_midemit_remainder_encodeBlockAsm4K: LEAQ (CX)(AX*1), BX // genMemMoveShort // margin: 0, min move: 30 CMPQ AX, $0x20 JBE emit_lit_memmove_mid_emit_remainder_encodeBlockAsm4K_memmove_move_17through32 JMP emit_lit_memmove_mid_emit_remainder_encodeBlockAsm4K_memmove_move_33through64 emit_lit_memmove_mid_emit_remainder_encodeBlockAsm4K_memmove_move_17through32: MOVOU (DX), X0 MOVOU -16(DX)(AX*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(AX*1) JMP memmove_mid_end_copy_emit_remainder_encodeBlockAsm4K emit_lit_memmove_mid_emit_remainder_encodeBlockAsm4K_memmove_move_33through64: MOVOU (DX), X0 MOVOU 16(DX), X1 MOVOU -32(DX)(AX*1), X2 MOVOU -16(DX)(AX*1), X3 MOVOU X0, (CX) MOVOU X1, 
16(CX) MOVOU X2, -32(CX)(AX*1) MOVOU X3, -16(CX)(AX*1) memmove_mid_end_copy_emit_remainder_encodeBlockAsm4K: MOVQ BX, CX JMP emit_remainder_end_encodeBlockAsm4K memmove_long_emit_remainder_encodeBlockAsm4K: LEAQ (CX)(AX*1), BX // genMemMoveLong MOVOU (DX), X0 MOVOU 16(DX), X1 MOVOU -32(DX)(AX*1), X2 MOVOU -16(DX)(AX*1), X3 MOVQ AX, DI SHRQ $0x05, DI MOVQ CX, SI ANDL $0x0000001f, SI MOVQ $0x00000040, R8 SUBQ SI, R8 DECQ DI JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm4Klarge_forward_sse_loop_32 LEAQ -32(DX)(R8*1), SI LEAQ -32(CX)(R8*1), R9 emit_lit_memmove_long_emit_remainder_encodeBlockAsm4Klarge_big_loop_back: MOVOU (SI), X4 MOVOU 16(SI), X5 MOVOA X4, (R9) MOVOA X5, 16(R9) ADDQ $0x20, R9 ADDQ $0x20, SI ADDQ $0x20, R8 DECQ DI JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm4Klarge_big_loop_back emit_lit_memmove_long_emit_remainder_encodeBlockAsm4Klarge_forward_sse_loop_32: MOVOU -32(DX)(R8*1), X4 MOVOU -16(DX)(R8*1), X5 MOVOA X4, -32(CX)(R8*1) MOVOA X5, -16(CX)(R8*1) ADDQ $0x20, R8 CMPQ AX, R8 JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm4Klarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(AX*1) MOVOU X3, -16(CX)(AX*1) MOVQ BX, CX emit_remainder_end_encodeBlockAsm4K: MOVQ dst_base+0(FP), AX SUBQ AX, CX MOVQ CX, ret+56(FP) RET

// func encodeBlockAsm1K(dst []byte, src []byte, tmp *[1024]byte) int
// Requires: BMI, CMOV, SSE2
//
// Encodes src into dst and returns the number of bytes written, or 0 when
// one of the destination-space checks below fails.
// tmp is cleared on entry and then used as a table of 512 16-bit source
// offsets, indexed by a 9-bit hash of 4 input bytes.
//
// Register use in the main loop:
//   AX = tmp (hash table), BX = src base, CX = dst write pointer,
//   DX = current source offset.
// Stack slots:
//   0(SP)  = dst_base + (len(src)-17) - (len(src)>>5); dst pointers are checked against it
//   8(SP)  = len(src)-17; searching stops once the next offset reaches it
//   12(SP) = source offset of the first byte not yet emitted
//   16(SP) = offset (distance) of the most recent match, initially 1
//   20(SP) = source offset to resume searching at when no candidate matches
TEXT ·encodeBlockAsm1K(SB), $24-64
	MOVQ tmp+48(FP), AX
	MOVQ dst_base+0(FP), CX
	// Clear tmp: 8 iterations of 128 bytes = 1024 bytes.
	MOVQ $0x00000008, DX
	MOVQ AX, BX
	PXOR X0, X0

zero_loop_encodeBlockAsm1K:
	MOVOU X0, (BX)
	MOVOU X0, 16(BX)
	MOVOU X0, 32(BX)
	MOVOU X0, 48(BX)
	MOVOU X0, 64(BX)
	MOVOU X0, 80(BX)
	MOVOU X0, 96(BX)
	MOVOU X0, 112(BX)
	ADDQ $0x80, BX
	DECQ DX
	JNZ zero_loop_encodeBlockAsm1K
	// Initialise the stack slots described in the header.
	MOVL $0x00000000, 12(SP)
	MOVQ src_len+32(FP), DX
	LEAQ -17(DX), BX
	LEAQ -17(DX), SI
	MOVL SI, 8(SP)
	SHRQ $0x05, DX
	SUBL DX, BX
	LEAQ (CX)(BX*1), BX
	MOVQ BX, (SP)
	MOVL $0x00000001, DX
	MOVL DX, 16(SP)
	MOVQ src_base+24(FP), BX

search_loop_encodeBlockAsm1K:
	// Next offset to try = DX + 4 + ((DX - 12(SP)) >> 4), so the step grows
	// with the number of bytes that are still waiting to be emitted.
	MOVL DX, SI
	SUBL 12(SP), SI
	SHRL $0x04, SI
	LEAL 4(DX)(SI*1), SI
	CMPL SI, 8(SP)
	JAE emit_remainder_encodeBlockAsm1K
	MOVQ (BX)(DX*1), DI
	MOVL SI, 20(SP)
	// Hash the 4 bytes at DX (R10) and at DX+1 (R11): shift left by 32 to
	// keep 4 bytes, multiply, keep the top 9 bits.
	MOVQ $0x9e3779b1, R9
	MOVQ DI, R10
	MOVQ DI, R11
	SHRQ $0x08, R11
	SHLQ $0x20, R10
	IMULQ R9, R10
	SHRQ $0x37, R10
	SHLQ $0x20, R11
	IMULQ R9, R11
	SHRQ $0x37, R11
	// Load both table entries as candidates (SI, R8), then store DX in both slots.
	MOVWLZX (AX)(R10*2), SI
	MOVWLZX (AX)(R11*2), R8
	MOVW DX, (AX)(R10*2)
	MOVW DX, (AX)(R11*2)
	// R10 = hash of the 4 bytes at DX+2; used in no_repeat_found below.
	MOVQ DI, R10
	SHRQ $0x10, R10
	SHLQ $0x20, R10
	IMULQ R9, R10
	SHRQ $0x37, R10
	// Repeat check: compare the 4 bytes at DX+1 with the 4 bytes at
	// DX+1-16(SP).
	MOVL DX, R9
	SUBL 16(SP), R9
	MOVL 1(BX)(R9*1), R11
	MOVQ DI, R9
	SHRQ $0x08, R9
	CMPL R9, R11
	JNE no_repeat_found_encodeBlockAsm1K
	// DI = start of the repeat match (DX+1), R8 = the matching earlier offset.
	LEAL 1(DX), DI
	MOVL 12(SP), SI
	MOVL DI, R8
	SUBL 16(SP), R8
	JZ repeat_extend_back_end_encodeBlockAsm1K

repeat_extend_back_loop_encodeBlockAsm1K:
	// Extend the repeat match backwards while DI > 12(SP), R8 > 0 and the
	// preceding bytes are equal.
	CMPL DI, SI
	JBE repeat_extend_back_end_encodeBlockAsm1K
	MOVB -1(BX)(R8*1), R9
	MOVB -1(BX)(DI*1), R10
	CMPB R9, R10
	JNE repeat_extend_back_end_encodeBlockAsm1K
	LEAL -1(DI), DI
	DECL R8
	JNZ repeat_extend_back_loop_encodeBlockAsm1K

repeat_extend_back_end_encodeBlockAsm1K:
	// SI = number of pending literal bytes (DI - 12(SP)).
	// Return 0 unless CX + SI + 3 is below the limit stored at (SP).
	MOVL DI, SI
	MOVL 12(SP), R8
	SUBL R8, SI
	LEAQ 3(CX)(SI*1), R9
	CMPQ R9, (SP)
	JB dst_size_check_ok_1
	MOVQ $0x00000000, ret+56(FP)
	RET

dst_size_check_ok_1:
	// R8 = address of the first pending literal byte.
	LEAQ (BX)(R8*1), R8

	// emitLiteral
	// R9 = length-1. Values below 29 are written as the single byte
	// (length-1)<<3. Otherwise length-30 follows a tag byte: 0xe8 with one
	// byte, 0xf0 with two bytes.
	LEAL -1(SI), R9
	CMPL R9, $0x1d
	JB one_byte_repeat_emit_lits_encodeBlockAsm1K
	SUBL $0x1d, R9
	CMPL R9, $0x00000100
	JB two_bytes_repeat_emit_lits_encodeBlockAsm1K
	// Same flags as the JB above, so this is not taken here; execution
	// reaches the label below by fall-through.
	JB three_bytes_repeat_emit_lits_encodeBlockAsm1K

three_bytes_repeat_emit_lits_encodeBlockAsm1K:
	MOVB $0xf0, (CX)
	MOVW R9, 1(CX)
	ADDQ $0x03, CX
	ADDL $0x1d, R9
	JMP memmove_long_repeat_emit_lits_encodeBlockAsm1K

two_bytes_repeat_emit_lits_encodeBlockAsm1K:
	MOVB $0xe8, (CX)
	MOVB R9, 1(CX)
	ADDL $0x1d, R9
	ADDQ $0x02, CX
	CMPL R9, $0x40
	JB memmove_midrepeat_emit_lits_encodeBlockAsm1K
	JMP memmove_long_repeat_emit_lits_encodeBlockAsm1K

one_byte_repeat_emit_lits_encodeBlockAsm1K:
	SHLB $0x03, R9
	MOVB R9, (CX)
	ADDQ $0x01, CX
	// R9 = CX + SI, the dst pointer after the literals.
	LEAQ (CX)(SI*1), R9

	// genMemMoveShort
	// margin: 16, min move: 1
	// Lengths up to 16 are copied with one 16-byte load/store; CX is set to
	// R9 afterwards, so only SI bytes count as written.
	CMPQ SI, $0x10
	JBE emit_lit_memmove_repeat_emit_lits_encodeBlockAsm1K_memmove_move_8through16
	CMPQ SI, $0x20
	JBE emit_lit_memmove_repeat_emit_lits_encodeBlockAsm1K_memmove_move_17through32
	JMP emit_lit_memmove_repeat_emit_lits_encodeBlockAsm1K_memmove_move_33through64

emit_lit_memmove_repeat_emit_lits_encodeBlockAsm1K_memmove_move_8through16:
	MOVOU (R8), X0
	MOVOU X0, (CX)
	JMP memmove_end_copy_repeat_emit_lits_encodeBlockAsm1K

emit_lit_memmove_repeat_emit_lits_encodeBlockAsm1K_memmove_move_17through32:
	// First 16 and last 16 bytes; the two stores may overlap.
	MOVOU (R8), X0
	MOVOU -16(R8)(SI*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(SI*1)
	JMP memmove_end_copy_repeat_emit_lits_encodeBlockAsm1K

emit_lit_memmove_repeat_emit_lits_encodeBlockAsm1K_memmove_move_33through64:
	// First 32 and last 32 bytes; the stores may overlap.
	MOVOU (R8), X0
	MOVOU 16(R8), X1
	MOVOU -32(R8)(SI*1), X2
	MOVOU -16(R8)(SI*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(SI*1)
	MOVOU X3, -16(CX)(SI*1)

memmove_end_copy_repeat_emit_lits_encodeBlockAsm1K:
	MOVQ R9, CX
	JMP repeat_emit_lits_end_encodeBlockAsm1K

memmove_midrepeat_emit_lits_encodeBlockAsm1K:
	LEAQ (CX)(SI*1), R9

	// genMemMoveShort
	// margin: 15, min move: 30
	CMPQ SI, $0x20
	JBE emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm1K_memmove_move_17through32
	JMP emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm1K_memmove_move_33through64

emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm1K_memmove_move_17through32:
	MOVOU (R8), X0
	MOVOU -16(R8)(SI*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(SI*1)
	JMP memmove_mid_end_copy_repeat_emit_lits_encodeBlockAsm1K

emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm1K_memmove_move_33through64:
	MOVOU (R8), X0
	MOVOU 16(R8), X1
	MOVOU -32(R8)(SI*1), X2
	MOVOU -16(R8)(SI*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(SI*1)
	MOVOU X3, -16(CX)(SI*1)

memmove_mid_end_copy_repeat_emit_lits_encodeBlockAsm1K:
	MOVQ R9, CX
	JMP repeat_emit_lits_end_encodeBlockAsm1K

memmove_long_repeat_emit_lits_encodeBlockAsm1K:
	LEAQ (CX)(SI*1), R9

	// genMemMoveLong
	// The first and last 32 bytes are loaded into X0-X3 up front and stored
	// last. In between, 32 bytes are copied per iteration; R12 starts at
	// 64 - (CX & 31), which makes the MOVOA store addresses 32-byte aligned.
	MOVOU (R8), X0
	MOVOU 16(R8), X1
	MOVOU -32(R8)(SI*1), X2
	MOVOU -16(R8)(SI*1), X3
	MOVQ SI, R11
	SHRQ $0x05, R11
	MOVQ CX, R10
	ANDL $0x0000001f, R10
	MOVQ $0x00000040, R12
	SUBQ R10, R12
	DECQ R11
	JA emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm1Klarge_forward_sse_loop_32
	LEAQ -32(R8)(R12*1), R10
	LEAQ -32(CX)(R12*1), R13

emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm1Klarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R13)
	MOVOA X5, 16(R13)
	ADDQ $0x20, R13
	ADDQ $0x20, R10
	ADDQ $0x20, R12
	DECQ R11
	JNA emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm1Klarge_big_loop_back

emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm1Klarge_forward_sse_loop_32:
	MOVOU -32(R8)(R12*1), X4
	MOVOU -16(R8)(R12*1), X5
	MOVOA X4, -32(CX)(R12*1)
	MOVOA X5, -16(CX)(R12*1)
	ADDQ $0x20, R12
	CMPQ SI, R12
	JAE emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm1Klarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(SI*1)
	MOVOU X3, -16(CX)(SI*1)
	MOVQ R9, CX

repeat_emit_lits_end_encodeBlockAsm1K:
	// The 4 bytes compared in the repeat check start at DX+1, so continue
	// at DX+5. R8 = bytes left in src, R9 = src+DX, SI = src+DX-16(SP).
	ADDL $0x05, DX
	MOVL DX, SI
	SUBL 16(SP), SI
	MOVQ src_len+32(FP), R8
	SUBL DX, R8
	LEAQ (BX)(DX*1), R9
	LEAQ (BX)(SI*1), SI

	// matchLen
	// R11 = number of equal bytes at R9 and SI, compared 16, 8, 4, 2 and
	// then 1 byte(s) at a time while R8 bytes remain.
	XORL R11, R11
	JMP matchlen_loop_16_entry_repeat_extend_encodeBlockAsm1K

matchlen_loopback_16_repeat_extend_encodeBlockAsm1K:
	MOVQ (R9)(R11*1), R10
	MOVQ 8(R9)(R11*1), R12
	XORQ (SI)(R11*1), R10
	JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm1K
	XORQ 8(SI)(R11*1), R12
	JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm1K
	LEAL -16(R8), R8
	LEAL 16(R11), R11

matchlen_loop_16_entry_repeat_extend_encodeBlockAsm1K:
	CMPL R8, $0x10
	JAE matchlen_loopback_16_repeat_extend_encodeBlockAsm1K
	JMP matchlen_match8_repeat_extend_encodeBlockAsm1K

matchlen_bsf_16repeat_extend_encodeBlockAsm1K:
	// Index of the lowest differing bit, divided by 8, is the number of
	// equal bytes in the second quadword.
#ifdef GOAMD64_v3
	TZCNTQ R12, R12
#else
	BSFQ R12, R12
#endif
	SARQ $0x03, R12
	LEAL 8(R11)(R12*1), R11
	JMP repeat_extend_forward_end_encodeBlockAsm1K

matchlen_match8_repeat_extend_encodeBlockAsm1K:
	CMPL R8, $0x08
	JB matchlen_match4_repeat_extend_encodeBlockAsm1K
	MOVQ (R9)(R11*1), R10
	XORQ (SI)(R11*1), R10
	JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm1K
	LEAL -8(R8), R8
	LEAL 8(R11), R11
	JMP matchlen_match4_repeat_extend_encodeBlockAsm1K

matchlen_bsf_8_repeat_extend_encodeBlockAsm1K:
#ifdef GOAMD64_v3
	TZCNTQ R10, R10
#else
	BSFQ R10, R10
#endif
	SARQ $0x03, R10
	LEAL (R11)(R10*1), R11
	JMP repeat_extend_forward_end_encodeBlockAsm1K

matchlen_match4_repeat_extend_encodeBlockAsm1K:
	CMPL R8, $0x04
	JB matchlen_match2_repeat_extend_encodeBlockAsm1K
	MOVL (R9)(R11*1), R10
	CMPL (SI)(R11*1), R10
	JNE matchlen_match2_repeat_extend_encodeBlockAsm1K
	LEAL -4(R8), R8
	LEAL 4(R11), R11

matchlen_match2_repeat_extend_encodeBlockAsm1K:
	// Exactly 1 byte left: compare a single byte. 0 left: done.
	CMPL R8, $0x01
	JE matchlen_match1_repeat_extend_encodeBlockAsm1K
	JB repeat_extend_forward_end_encodeBlockAsm1K
	MOVW (R9)(R11*1), R10
	CMPW (SI)(R11*1), R10
	JNE matchlen_match1_repeat_extend_encodeBlockAsm1K
	LEAL 2(R11), R11
	SUBL $0x02, R8
	JZ repeat_extend_forward_end_encodeBlockAsm1K

matchlen_match1_repeat_extend_encodeBlockAsm1K:
	MOVB (R9)(R11*1), R10
	CMPB (SI)(R11*1), R10
	JNE repeat_extend_forward_end_encodeBlockAsm1K
	LEAL 1(R11), R11

repeat_extend_forward_end_encodeBlockAsm1K:
	// DX = end of the repeat match; SI = DX - DI = its total length
	// (DI is the match start after the backward extension).
	ADDL R11, DX
	MOVL DX, SI
	SUBL DI, SI
	// The value loaded into DI here is overwritten by the next instruction.
	MOVL 16(SP), DI

	// emitRepeat
	// Lengths up to 29 are written as the single byte (length<<3)-4.
	// Longer lengths write length-30 after a tag byte: 0xec (1 byte),
	// 0xf4 (2 bytes) or 0xfc (4-byte store, CX advances by 4 in total).
	LEAL -1(SI), DI
	CMPL SI, $0x1d
	JBE repeat_one_match_repeat_encodeBlockAsm1K
	LEAL -30(SI), DI
	CMPL SI, $0x0000011e
	JB repeat_two_match_repeat_encodeBlockAsm1K
	CMPL SI, $0x0001001e
	JB repeat_three_match_repeat_encodeBlockAsm1K
	MOVB $0xfc, (CX)
	MOVL DI, 1(CX)
	ADDQ $0x04, CX
	JMP repeat_end_emit_encodeBlockAsm1K

repeat_three_match_repeat_encodeBlockAsm1K:
	MOVB $0xf4, (CX)
	MOVW DI, 1(CX)
	ADDQ $0x03, CX
	JMP repeat_end_emit_encodeBlockAsm1K

repeat_two_match_repeat_encodeBlockAsm1K:
	MOVB $0xec, (CX)
	MOVB DI, 1(CX)
	ADDQ $0x02, CX
	JMP repeat_end_emit_encodeBlockAsm1K

repeat_one_match_repeat_encodeBlockAsm1K:
	XORL DI, DI
	LEAL -4(DI)(SI*8), DI
	MOVB DI, (CX)
	ADDQ $0x01, CX

repeat_end_emit_encodeBlockAsm1K:
	// Everything before DX has now been emitted.
	MOVL DX, 12(SP)
	JMP search_loop_encodeBlockAsm1K

no_repeat_found_encodeBlockAsm1K:
	// Try candidate SI against the 4 bytes at DX, candidate R8 against the
	// 4 bytes at DX+1, then the table entry for the DX+2 hash against the
	// 4 bytes at DX+2. That table entry is set to DX+2 (R9) on the way.
	CMPL (BX)(SI*1), DI
	JEQ candidate_match_encodeBlockAsm1K
	SHRQ $0x08, DI
	MOVWLZX (AX)(R10*2), SI
	LEAL 2(DX), R9
	CMPL (BX)(R8*1), DI
	JEQ candidate2_match_encodeBlockAsm1K
	MOVW R9, (AX)(R10*2)
	SHRQ $0x08, DI
	CMPL (BX)(SI*1), DI
	JEQ candidate3_match_encodeBlockAsm1K
	// No candidate matched: continue at the offset saved in 20(SP).
	MOVL 20(SP), DX
	JMP search_loop_encodeBlockAsm1K

candidate3_match_encodeBlockAsm1K:
	ADDL $0x02, DX
	JMP candidate_match_encodeBlockAsm1K

candidate2_match_encodeBlockAsm1K:
	MOVW R9, (AX)(R10*2)
	INCL DX
	MOVL R8, SI

candidate_match_encodeBlockAsm1K:
	// DX = match position, SI = candidate position.
	MOVL 12(SP), DI
	TESTL SI, SI
	JZ match_extend_back_end_encodeBlockAsm1K

match_extend_back_loop_encodeBlockAsm1K:
	// Extend the match backwards while DX > 12(SP), SI > 0 and the
	// preceding bytes are equal.
	CMPL DX, DI
	JBE match_extend_back_end_encodeBlockAsm1K
	MOVB -1(BX)(SI*1), R8
	MOVB -1(BX)(DX*1), R9
	CMPB R8, R9
	JNE match_extend_back_end_encodeBlockAsm1K
	LEAL -1(DX), DX
	DECL SI
	JZ match_extend_back_end_encodeBlockAsm1K
	JMP match_extend_back_loop_encodeBlockAsm1K

match_extend_back_end_encodeBlockAsm1K:
	// Return 0 if the dst pointer has reached the limit stored at (SP).
	CMPQ CX, (SP)
	JB dst_size_check_ok_2
	MOVQ $0x00000000, ret+56(FP)
	RET

dst_size_check_ok_2:
	// R8 = match start, 16(SP) = DX - SI = match offset. Then skip the
	// 4 bytes already known to be equal and measure the rest.
	MOVL DX, R8
	MOVL DX, DI
	SUBL SI, DI
	MOVL DI, 16(SP)
	ADDL $0x04, DX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), DI
	SUBL DX, DI
	LEAQ (BX)(DX*1), R9
	LEAQ (BX)(SI*1), SI

	// matchLen
	// Same scheme as above; DI = bytes left in src, R11 = result.
	XORL R11, R11
	JMP matchlen_loop_16_entry_match_nolit_encodeBlockAsm1K

matchlen_loopback_16_match_nolit_encodeBlockAsm1K:
	MOVQ (R9)(R11*1), R10
	MOVQ 8(R9)(R11*1), R12
	XORQ (SI)(R11*1), R10
	JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm1K
	XORQ 8(SI)(R11*1), R12
	JNZ matchlen_bsf_16match_nolit_encodeBlockAsm1K
	LEAL -16(DI), DI
	LEAL 16(R11), R11

matchlen_loop_16_entry_match_nolit_encodeBlockAsm1K:
	CMPL DI, $0x10
	JAE matchlen_loopback_16_match_nolit_encodeBlockAsm1K
	JMP matchlen_match8_match_nolit_encodeBlockAsm1K

matchlen_bsf_16match_nolit_encodeBlockAsm1K:
#ifdef GOAMD64_v3
	TZCNTQ R12, R12
#else
	BSFQ R12, R12
#endif
	SARQ $0x03, R12
	LEAL 8(R11)(R12*1), R11
	JMP match_nolit_end_encodeBlockAsm1K

matchlen_match8_match_nolit_encodeBlockAsm1K:
	CMPL DI, $0x08
	JB matchlen_match4_match_nolit_encodeBlockAsm1K
	MOVQ (R9)(R11*1), R10
	XORQ (SI)(R11*1), R10
	JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm1K
	LEAL -8(DI), DI
	LEAL 8(R11), R11
	JMP matchlen_match4_match_nolit_encodeBlockAsm1K

matchlen_bsf_8_match_nolit_encodeBlockAsm1K:
#ifdef GOAMD64_v3
	TZCNTQ R10, R10
#else
	BSFQ R10, R10
#endif
	SARQ $0x03, R10
	LEAL (R11)(R10*1), R11
	JMP match_nolit_end_encodeBlockAsm1K

matchlen_match4_match_nolit_encodeBlockAsm1K:
	CMPL DI, $0x04
	JB matchlen_match2_match_nolit_encodeBlockAsm1K
	MOVL (R9)(R11*1), R10
	CMPL (SI)(R11*1), R10
	JNE matchlen_match2_match_nolit_encodeBlockAsm1K
	LEAL -4(DI), DI
	LEAL 4(R11), R11

matchlen_match2_match_nolit_encodeBlockAsm1K:
	CMPL DI, $0x01
	JE matchlen_match1_match_nolit_encodeBlockAsm1K
	JB match_nolit_end_encodeBlockAsm1K
	MOVW (R9)(R11*1), R10
	CMPW (SI)(R11*1), R10
	JNE matchlen_match1_match_nolit_encodeBlockAsm1K
	LEAL 2(R11), R11
	SUBL $0x02, DI
	JZ match_nolit_end_encodeBlockAsm1K

matchlen_match1_match_nolit_encodeBlockAsm1K:
	MOVB (R9)(R11*1), R10
	CMPB (SI)(R11*1), R10
	JNE match_nolit_end_encodeBlockAsm1K
	LEAL 1(R11), R11

match_nolit_end_encodeBlockAsm1K:
	// DX = end of match, R11 = match length (4 + extension), SI = offset,
	// R8 = number of pending literal bytes, DI = offset of the first one.
	ADDL R11, DX
	ADDL $0x04, R11
	MOVL 16(SP), SI
	MOVL 12(SP), DI
	MOVL DX, 12(SP)
	SUBL DI, R8
	JZ match_nolits_copy_encodeBlockAsm1K
	LEAQ (BX)(DI*1), DI
	// With 1 to 3 pending literals and an offset of at least 64, the
	// literals are fused into the copy; otherwise they are emitted first.
	CMPL R8, $0x03
	JA match_emit_lits_copy_encodeBlockAsm1K
	CMPL SI, $0x40
	JB match_emit_lits_copy_encodeBlockAsm1K
	MOVL (DI), DI

	// emitCopy2WithLits
	// Tag byte = 3 + 8*((literals-1) + 4*min(length-4, 7)), followed by
	// the 16-bit value offset-64 and a 4-byte store of literal data, of
	// which R8 bytes are kept. R9 = length-11 when length-4 >= 7, else 0;
	// a non-zero R9 is emitted as a repeat.
	XORQ R9, R9
	SUBL $0x40, SI
	LEAL -11(R11), R10
	LEAL -4(R11), R11
	MOVW SI, 1(CX)
	CMPL R11, $0x07
	CMOVLGE R10, R9
	MOVQ $0x00000007, SI
	CMOVLLT R11, SI
	LEAL -1(R8)(SI*4), SI
	MOVL $0x00000003, R10
	LEAL (R10)(SI*8), SI
	MOVB SI, (CX)
	ADDQ $0x03, CX
	MOVL DI, (CX)
	ADDQ R8, CX
	TESTL R9, R9
	JZ match_nolit_emitcopy_end_encodeBlockAsm1K

	// emitRepeat
	// Same encoding as the emitRepeat above, with the length in R9.
	LEAL -1(R9), SI
	CMPL R9, $0x1d
	JBE repeat_one_match_emit_repeat_copy2_encodeBlockAsm1K
	LEAL -30(R9), SI
	CMPL R9, $0x0000011e
	JB repeat_two_match_emit_repeat_copy2_encodeBlockAsm1K
	CMPL R9, $0x0001001e
	JB repeat_three_match_emit_repeat_copy2_encodeBlockAsm1K
	MOVB $0xfc, (CX)
	MOVL SI, 1(CX)
	ADDQ $0x04, CX
	JMP match_nolit_emitcopy_end_encodeBlockAsm1K

repeat_three_match_emit_repeat_copy2_encodeBlockAsm1K:
	MOVB $0xf4, (CX)
	MOVW SI, 1(CX)
	ADDQ $0x03, CX
	JMP match_nolit_emitcopy_end_encodeBlockAsm1K

repeat_two_match_emit_repeat_copy2_encodeBlockAsm1K:
	MOVB $0xec, (CX)
	MOVB SI, 1(CX)
	ADDQ $0x02, CX
	JMP match_nolit_emitcopy_end_encodeBlockAsm1K

repeat_one_match_emit_repeat_copy2_encodeBlockAsm1K:
	XORL SI, SI
	LEAL -4(SI)(R9*8), SI
	MOVB SI, (CX)
	ADDQ $0x01, CX
	JMP match_nolit_emitcopy_end_encodeBlockAsm1K

match_emit_lits_copy_encodeBlockAsm1K:
	// Return 0 unless CX + R8 + 3 is below the limit stored at (SP).
	LEAQ 3(CX)(R8*1), R9
	CMPQ R9, (SP)
	JB dst_size_check_ok_3
	MOVQ $0x00000000, ret+56(FP)
	RET

dst_size_check_ok_3:
	// emitLiteral
	// Same encoding as the emitLiteral above; source DI, length R8.
	LEAL -1(R8), R9
	CMPL R9, $0x1d
	JB one_byte_match_emit_encodeBlockAsm1K
	SUBL $0x1d, R9
	CMPL R9, $0x00000100
	JB two_bytes_match_emit_encodeBlockAsm1K
	// Not taken (flags unchanged); falls through to the label below.
	JB three_bytes_match_emit_encodeBlockAsm1K

three_bytes_match_emit_encodeBlockAsm1K:
	MOVB $0xf0, (CX)
	MOVW R9, 1(CX)
	ADDQ $0x03, CX
	ADDL $0x1d, R9
	JMP memmove_long_match_emit_encodeBlockAsm1K

two_bytes_match_emit_encodeBlockAsm1K:
	MOVB $0xe8, (CX)
	MOVB R9, 1(CX)
	ADDL $0x1d, R9
	ADDQ $0x02, CX
	CMPL R9, $0x40
	JB memmove_midmatch_emit_encodeBlockAsm1K
	JMP memmove_long_match_emit_encodeBlockAsm1K

one_byte_match_emit_encodeBlockAsm1K:
	SHLB $0x03, R9
	MOVB R9, (CX)
	ADDQ $0x01, CX
	LEAQ (CX)(R8*1), R9

	// genMemMoveShort
	// margin: 16, min move: 1
	CMPQ R8, $0x10
	JBE emit_lit_memmove_match_emit_encodeBlockAsm1K_memmove_move_8through16
	CMPQ R8, $0x20
	JBE emit_lit_memmove_match_emit_encodeBlockAsm1K_memmove_move_17through32
	JMP emit_lit_memmove_match_emit_encodeBlockAsm1K_memmove_move_33through64

emit_lit_memmove_match_emit_encodeBlockAsm1K_memmove_move_8through16:
	MOVOU (DI), X0
	MOVOU X0, (CX)
	JMP memmove_end_copy_match_emit_encodeBlockAsm1K

emit_lit_memmove_match_emit_encodeBlockAsm1K_memmove_move_17through32:
	MOVOU (DI), X0
	MOVOU -16(DI)(R8*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(R8*1)
	JMP memmove_end_copy_match_emit_encodeBlockAsm1K

emit_lit_memmove_match_emit_encodeBlockAsm1K_memmove_move_33through64:
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R8*1), X2
	MOVOU -16(DI)(R8*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R8*1)
	MOVOU X3, -16(CX)(R8*1)

memmove_end_copy_match_emit_encodeBlockAsm1K:
	MOVQ R9, CX
	JMP match_nolits_copy_encodeBlockAsm1K

memmove_midmatch_emit_encodeBlockAsm1K:
	LEAQ (CX)(R8*1), R9

	// genMemMoveShort
	// margin: 15, min move: 30
	CMPQ R8, $0x20
	JBE emit_lit_memmove_mid_match_emit_encodeBlockAsm1K_memmove_move_17through32
	JMP emit_lit_memmove_mid_match_emit_encodeBlockAsm1K_memmove_move_33through64

emit_lit_memmove_mid_match_emit_encodeBlockAsm1K_memmove_move_17through32:
	MOVOU (DI), X0
	MOVOU -16(DI)(R8*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(R8*1)
	JMP memmove_mid_end_copy_match_emit_encodeBlockAsm1K

emit_lit_memmove_mid_match_emit_encodeBlockAsm1K_memmove_move_33through64:
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R8*1), X2
	MOVOU -16(DI)(R8*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R8*1)
	MOVOU X3, -16(CX)(R8*1)

memmove_mid_end_copy_match_emit_encodeBlockAsm1K:
	MOVQ R9, CX
	JMP match_nolits_copy_encodeBlockAsm1K

memmove_long_match_emit_encodeBlockAsm1K:
	LEAQ (CX)(R8*1), R9

	// genMemMoveLong
	// Same scheme as the genMemMoveLong above; source DI, length R8.
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R8*1), X2
	MOVOU -16(DI)(R8*1), X3
	MOVQ R8, R12
	SHRQ $0x05, R12
	MOVQ CX, R10
	ANDL $0x0000001f, R10
	MOVQ $0x00000040, R13
	SUBQ R10, R13
	DECQ R12
	JA emit_lit_memmove_long_match_emit_encodeBlockAsm1Klarge_forward_sse_loop_32
	LEAQ -32(DI)(R13*1), R10
	LEAQ -32(CX)(R13*1), R14

emit_lit_memmove_long_match_emit_encodeBlockAsm1Klarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R14)
	MOVOA X5, 16(R14)
	ADDQ $0x20, R14
	ADDQ $0x20, R10
	ADDQ $0x20, R13
	DECQ R12
	JNA emit_lit_memmove_long_match_emit_encodeBlockAsm1Klarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeBlockAsm1Klarge_forward_sse_loop_32:
	MOVOU -32(DI)(R13*1), X4
	MOVOU -16(DI)(R13*1), X5
	MOVOA X4, -32(CX)(R13*1)
	MOVOA X5, -16(CX)(R13*1)
	ADDQ $0x20, R13
	CMPQ R8, R13
	JAE emit_lit_memmove_long_match_emit_encodeBlockAsm1Klarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R8*1)
	MOVOU X3, -16(CX)(R8*1)
	MOVQ R9, CX

match_nolits_copy_encodeBlockAsm1K:
	// emitCopy
	// SI = offset, R11 = length. Offsets above 1024 go to emitCopy2.
	// length < 19: 16-bit value ((offset-1)<<6) + 4*length - 15.
	CMPL SI, $0x00000400
	JA two_byte_match_nolit_encodeBlockAsm1K
	CMPL R11, $0x00000013
	JAE emit_one_longer_match_nolit_encodeBlockAsm1K
	LEAL -1(SI), SI
	SHLL $0x06, SI
	LEAL -15(SI)(R11*4), SI
	MOVW SI, (CX)
	ADDQ $0x02, CX
	JMP match_nolit_emitcopy_end_encodeBlockAsm1K

emit_one_longer_match_nolit_encodeBlockAsm1K:
	// length < 274: 16-bit value ((offset-1)<<6) + 61, then the byte
	// length-18.
	CMPL R11, $0x00000112
	JAE emit_copy1_repeat_match_nolit_encodeBlockAsm1K
	LEAL -1(SI), SI
	SHLL $0x06, SI
	LEAL 61(SI), SI
	MOVW SI, (CX)
	LEAL -18(R11), SI
	MOVB SI, 2(CX)
	ADDQ $0x03, CX
	JMP match_nolit_emitcopy_end_encodeBlockAsm1K

emit_copy1_repeat_match_nolit_encodeBlockAsm1K:
	// 16-bit value ((offset-1)<<6) + 57, then the remaining length-18 is
	// emitted as a repeat.
	LEAL -1(SI), SI
	SHLL $0x06, SI
	LEAL 57(SI), SI
	MOVW SI, (CX)
	ADDQ $0x02, CX
	SUBL $0x12, R11

	// emitRepeat
	// Same encoding as the emitRepeat above, with the length in R11.
	LEAL -1(R11), SI
	CMPL R11, $0x1d
	JBE repeat_one_emit_copy1_do_repeat_match_nolit_encodeBlockAsm1K
	LEAL -30(R11), SI
	CMPL R11, $0x0000011e
	JB repeat_two_emit_copy1_do_repeat_match_nolit_encodeBlockAsm1K
	CMPL R11, $0x0001001e
	JB repeat_three_emit_copy1_do_repeat_match_nolit_encodeBlockAsm1K
	MOVB $0xfc, (CX)
	MOVL SI, 1(CX)
	ADDQ $0x04, CX
	JMP match_nolit_emitcopy_end_encodeBlockAsm1K

repeat_three_emit_copy1_do_repeat_match_nolit_encodeBlockAsm1K:
	MOVB $0xf4, (CX)
	MOVW SI, 1(CX)
	ADDQ $0x03, CX
	JMP match_nolit_emitcopy_end_encodeBlockAsm1K

repeat_two_emit_copy1_do_repeat_match_nolit_encodeBlockAsm1K:
	MOVB $0xec, (CX)
	MOVB SI, 1(CX)
	ADDQ $0x02, CX
	JMP match_nolit_emitcopy_end_encodeBlockAsm1K

repeat_one_emit_copy1_do_repeat_match_nolit_encodeBlockAsm1K:
	XORL SI, SI
	LEAL -4(SI)(R11*8), SI
	MOVB SI, (CX)
	ADDQ $0x01, CX
	JMP match_nolit_emitcopy_end_encodeBlockAsm1K

two_byte_match_nolit_encodeBlockAsm1K:
	// emitCopy2
	// The 16-bit value offset-64 goes to 1(CX); R11 = length-4.
	// R11 <= 60: tag byte 2 + 4*R11 (3 bytes in total). Otherwise tag
	// 0xf6, 0xfa or 0xfe, with R11-60 stored at 3(CX) and CX advanced by
	// 4, 5 or 6 bytes in total.
	LEAL -64(SI), SI
	LEAL -4(R11), R11
	MOVW SI, 1(CX)
	CMPL R11, $0x3c
	JBE emit_copy2_0_match_nolit_encodeBlockAsm1K_emit2
	LEAL -60(R11), SI
	CMPL R11, $0x0000013c
	JB emit_copy2_1_match_nolit_encodeBlockAsm1K_emit2
	CMPL R11, $0x0001003c
	JB emit_copy2_2_match_nolit_encodeBlockAsm1K_emit2
	MOVB $0xfe, (CX)
	MOVL SI, 3(CX)
	ADDQ $0x06, CX
	JMP match_nolit_emitcopy_end_encodeBlockAsm1K

emit_copy2_2_match_nolit_encodeBlockAsm1K_emit2:
	MOVB $0xfa, (CX)
	MOVW SI, 3(CX)
	ADDQ $0x05, CX
	JMP match_nolit_emitcopy_end_encodeBlockAsm1K

emit_copy2_1_match_nolit_encodeBlockAsm1K_emit2:
	MOVB $0xf6, (CX)
	MOVB SI, 3(CX)
	ADDQ $0x04, CX
	JMP match_nolit_emitcopy_end_encodeBlockAsm1K

emit_copy2_0_match_nolit_encodeBlockAsm1K_emit2:
	MOVL $0x00000002, SI
	LEAL (SI)(R11*4), SI
	MOVB SI, (CX)
	ADDQ $0x03, CX

match_nolit_emitcopy_end_encodeBlockAsm1K:
	// Stop searching once DX has reached 8(SP); return 0 if the dst
	// pointer has reached the limit stored at (SP).
	CMPL DX, 8(SP)
	JAE emit_remainder_encodeBlockAsm1K
	MOVQ -2(BX)(DX*1), DI
	CMPQ CX, (SP)
	JB match_nolit_dst_ok_encodeBlockAsm1K
	MOVQ $0x00000000, ret+56(FP)
	RET

match_nolit_dst_ok_encodeBlockAsm1K:
	// R8 = hash of the 4 bytes at DX-2, R9 = hash of the 4 bytes at DX.
	// Load the candidate for DX (SI), then store DX-2 and DX in the table.
	MOVQ $0x9e3779b1, SI
	MOVQ DI, R8
	SHRQ $0x10, DI
	MOVQ DI, R9
	SHLQ $0x20, R8
	IMULQ SI, R8
	SHRQ $0x37, R8
	SHLQ $0x20, R9
	IMULQ SI, R9
	SHRQ $0x37, R9
	LEAL -2(DX), R10
	MOVWLZX (AX)(R9*2), SI
	MOVW R10, (AX)(R8*2)
	MOVW DX, (AX)(R9*2)
	// If the candidate does not match at DX, resume searching at DX+1.
	MOVL DX, R8
	INCL DX
	CMPL (BX)(SI*1), DI
	JNE search_loop_encodeBlockAsm1K
	// Immediate match at R8 (the previous DX): 16(SP) = R8 - SI.
	MOVL R8, DI
	SUBL SI, DI
	MOVL DI, 16(SP)
	CMPQ CX, (SP)
	JB dst_size_check_ok_4
	MOVQ $0x00000000, ret+56(FP)
	RET

dst_size_check_ok_4:
	// Skip the 4 bytes already compared (DX was incremented once above).
	ADDL $0x03, DX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), DI
	SUBL DX, DI
	LEAQ (BX)(DX*1), R8
	LEAQ (BX)(SI*1), SI

	// matchLen
	// Same scheme as above; DI = bytes left in src, R11 = result.
	XORL R11, R11
	JMP matchlen_loop_16_entry_match_nolit2_encodeBlockAsm1K

matchlen_loopback_16_match_nolit2_encodeBlockAsm1K:
	MOVQ (R8)(R11*1), R9
	MOVQ 8(R8)(R11*1), R10
	XORQ (SI)(R11*1), R9
	JNZ matchlen_bsf_8_match_nolit2_encodeBlockAsm1K
	XORQ 8(SI)(R11*1), R10
	JNZ matchlen_bsf_16match_nolit2_encodeBlockAsm1K
	LEAL -16(DI), DI
	LEAL 16(R11), R11

matchlen_loop_16_entry_match_nolit2_encodeBlockAsm1K:
	CMPL DI, $0x10
	JAE matchlen_loopback_16_match_nolit2_encodeBlockAsm1K
	JMP matchlen_match8_match_nolit2_encodeBlockAsm1K

matchlen_bsf_16match_nolit2_encodeBlockAsm1K:
#ifdef GOAMD64_v3
	TZCNTQ R10, R10
#else
	BSFQ R10, R10
#endif
	SARQ $0x03, R10
	LEAL 8(R11)(R10*1), R11
	JMP match_nolit2_end_encodeBlockAsm1K

matchlen_match8_match_nolit2_encodeBlockAsm1K:
	CMPL DI, $0x08
	JB matchlen_match4_match_nolit2_encodeBlockAsm1K
	MOVQ (R8)(R11*1), R9
	XORQ (SI)(R11*1), R9
	JNZ matchlen_bsf_8_match_nolit2_encodeBlockAsm1K
	LEAL -8(DI), DI
	LEAL 8(R11), R11
	JMP matchlen_match4_match_nolit2_encodeBlockAsm1K

matchlen_bsf_8_match_nolit2_encodeBlockAsm1K:
#ifdef GOAMD64_v3
	TZCNTQ R9, R9
#else
	BSFQ R9, R9
#endif
	SARQ $0x03, R9
	LEAL (R11)(R9*1), R11
	JMP match_nolit2_end_encodeBlockAsm1K

matchlen_match4_match_nolit2_encodeBlockAsm1K:
	CMPL DI, $0x04
	JB matchlen_match2_match_nolit2_encodeBlockAsm1K
	MOVL (R8)(R11*1), R9
	CMPL (SI)(R11*1), R9
	JNE matchlen_match2_match_nolit2_encodeBlockAsm1K
	LEAL -4(DI), DI
	LEAL 4(R11), R11

matchlen_match2_match_nolit2_encodeBlockAsm1K:
	CMPL DI, $0x01
	JE matchlen_match1_match_nolit2_encodeBlockAsm1K
	JB match_nolit2_end_encodeBlockAsm1K
	MOVW (R8)(R11*1), R9
	CMPW (SI)(R11*1), R9
	JNE matchlen_match1_match_nolit2_encodeBlockAsm1K
	LEAL 2(R11), R11
	SUBL $0x02, DI
	JZ match_nolit2_end_encodeBlockAsm1K

matchlen_match1_match_nolit2_encodeBlockAsm1K:
	MOVB (R8)(R11*1), R9
	CMPB (SI)(R11*1), R9
	JNE match_nolit2_end_encodeBlockAsm1K
	LEAL 1(R11), R11

match_nolit2_end_encodeBlockAsm1K:
	// This match starts where the previous one ended, so there are no
	// literals to emit: go straight to the copy with SI = offset and
	// R11 = length.
	ADDL R11, DX
	ADDL $0x04, R11
	MOVL DX, 12(SP)
	MOVL 16(SP), SI
	JMP match_nolits_copy_encodeBlockAsm1K

emit_remainder_encodeBlockAsm1K:
	// AX = len(src) - 12(SP), the bytes not yet emitted; they are written
	// as one literal run. From here on AX, BX and DX no longer hold tmp,
	// the src base and the search offset: DX becomes the source pointer.
	MOVQ src_len+32(FP), AX
	MOVL 12(SP), DX
	SUBL DX, AX
	JZ emit_remainder_end_encodeBlockAsm1K
	LEAQ (BX)(DX*1), DX
	// Return 0 unless CX + AX + 3 is below the limit stored at (SP).
	LEAQ 3(CX)(AX*1), BX
	CMPQ BX, (SP)
	JB dst_size_check_ok_5
	MOVQ $0x00000000, ret+56(FP)
	RET

dst_size_check_ok_5:
	// emitLiteral
	// Same encoding as the emitLiteral above; source DX, length AX.
	LEAL -1(AX), BX
	CMPL BX, $0x1d
	JB one_byte_emit_remainder_encodeBlockAsm1K
	SUBL $0x1d, BX
	CMPL BX, $0x00000100
	JB two_bytes_emit_remainder_encodeBlockAsm1K
	// Not taken (flags unchanged); falls through to the label below.
	JB three_bytes_emit_remainder_encodeBlockAsm1K

three_bytes_emit_remainder_encodeBlockAsm1K:
	MOVB $0xf0, (CX)
	MOVW BX, 1(CX)
	ADDQ $0x03, CX
	ADDL $0x1d, BX
	JMP memmove_long_emit_remainder_encodeBlockAsm1K

two_bytes_emit_remainder_encodeBlockAsm1K:
	MOVB $0xe8, (CX)
	MOVB BL, 1(CX)
	ADDL $0x1d, BX
	ADDQ $0x02, CX
	CMPL BX, $0x40
	JB memmove_midemit_remainder_encodeBlockAsm1K
	JMP memmove_long_emit_remainder_encodeBlockAsm1K

one_byte_emit_remainder_encodeBlockAsm1K:
	SHLB $0x03, BL
	MOVB BL, (CX)
	ADDQ $0x01, CX
	LEAQ (CX)(AX*1), BX

	// genMemMoveShort
	// margin: 0, min move: 1
	// With margin 0 every size class copies exactly AX bytes, using
	// overlapping first/last loads and stores within each class.
	CMPQ AX, $0x03
	JB emit_lit_memmove_emit_remainder_encodeBlockAsm1K_memmove_move_1or2
	JE emit_lit_memmove_emit_remainder_encodeBlockAsm1K_memmove_move_3
	CMPQ AX, $0x08
	JBE emit_lit_memmove_emit_remainder_encodeBlockAsm1K_memmove_move_4through8
	CMPQ AX, $0x10
	JBE emit_lit_memmove_emit_remainder_encodeBlockAsm1K_memmove_move_8through16
	CMPQ AX, $0x20
	JBE emit_lit_memmove_emit_remainder_encodeBlockAsm1K_memmove_move_17through32
	JMP emit_lit_memmove_emit_remainder_encodeBlockAsm1K_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeBlockAsm1K_memmove_move_1or2:
	MOVB (DX), SI
	MOVB -1(DX)(AX*1), DL
	MOVB SI, (CX)
	MOVB DL, -1(CX)(AX*1)
	JMP memmove_end_copy_emit_remainder_encodeBlockAsm1K

emit_lit_memmove_emit_remainder_encodeBlockAsm1K_memmove_move_3:
	MOVW (DX), SI
	MOVB 2(DX), DL
	MOVW SI, (CX)
	MOVB DL, 2(CX)
	JMP memmove_end_copy_emit_remainder_encodeBlockAsm1K

emit_lit_memmove_emit_remainder_encodeBlockAsm1K_memmove_move_4through8:
	MOVL (DX), SI
	MOVL -4(DX)(AX*1), DX
	MOVL SI, (CX)
	MOVL DX, -4(CX)(AX*1)
	JMP memmove_end_copy_emit_remainder_encodeBlockAsm1K

emit_lit_memmove_emit_remainder_encodeBlockAsm1K_memmove_move_8through16:
	MOVQ (DX), SI
	MOVQ -8(DX)(AX*1), DX
	MOVQ SI, (CX)
	MOVQ DX, -8(CX)(AX*1)
	JMP memmove_end_copy_emit_remainder_encodeBlockAsm1K

emit_lit_memmove_emit_remainder_encodeBlockAsm1K_memmove_move_17through32:
	MOVOU (DX), X0
	MOVOU -16(DX)(AX*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(AX*1)
	JMP memmove_end_copy_emit_remainder_encodeBlockAsm1K

emit_lit_memmove_emit_remainder_encodeBlockAsm1K_memmove_move_33through64:
	MOVOU (DX), X0
	MOVOU 16(DX), X1
	MOVOU -32(DX)(AX*1), X2
	MOVOU -16(DX)(AX*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(AX*1)
	MOVOU X3, -16(CX)(AX*1)

memmove_end_copy_emit_remainder_encodeBlockAsm1K:
	MOVQ BX, CX
	JMP emit_remainder_end_encodeBlockAsm1K

memmove_midemit_remainder_encodeBlockAsm1K:
	LEAQ (CX)(AX*1), BX

	// genMemMoveShort
	// margin: 0, min move: 30
	CMPQ AX, $0x20
	JBE emit_lit_memmove_mid_emit_remainder_encodeBlockAsm1K_memmove_move_17through32
	JMP emit_lit_memmove_mid_emit_remainder_encodeBlockAsm1K_memmove_move_33through64

emit_lit_memmove_mid_emit_remainder_encodeBlockAsm1K_memmove_move_17through32:
	MOVOU (DX), X0
	MOVOU -16(DX)(AX*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(AX*1)
	JMP memmove_mid_end_copy_emit_remainder_encodeBlockAsm1K

emit_lit_memmove_mid_emit_remainder_encodeBlockAsm1K_memmove_move_33through64:
	MOVOU (DX), X0
	MOVOU 16(DX), X1
	MOVOU -32(DX)(AX*1), X2
	MOVOU -16(DX)(AX*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(AX*1)
	MOVOU X3, -16(CX)(AX*1)

memmove_mid_end_copy_emit_remainder_encodeBlockAsm1K:
	MOVQ BX, CX
	JMP emit_remainder_end_encodeBlockAsm1K

memmove_long_emit_remainder_encodeBlockAsm1K:
	LEAQ (CX)(AX*1), BX

	// genMemMoveLong
	// Same scheme as the genMemMoveLong above; source DX, length AX.
	MOVOU (DX), X0
	MOVOU 16(DX), X1
	MOVOU -32(DX)(AX*1), X2
	MOVOU -16(DX)(AX*1), X3
	MOVQ AX, DI
	SHRQ $0x05, DI
	MOVQ CX, SI
	ANDL $0x0000001f, SI
	MOVQ $0x00000040, R8
	SUBQ SI, R8
	DECQ DI
	JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm1Klarge_forward_sse_loop_32
	LEAQ -32(DX)(R8*1), SI
	LEAQ -32(CX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeBlockAsm1Klarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ $0x20, R9
	ADDQ $0x20, SI
	ADDQ $0x20, R8
	DECQ DI
	JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm1Klarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeBlockAsm1Klarge_forward_sse_loop_32:
	MOVOU -32(DX)(R8*1), X4
	MOVOU -16(DX)(R8*1), X5
	MOVOA X4, -32(CX)(R8*1)
	MOVOA X5, -16(CX)(R8*1)
	ADDQ $0x20, R8
	CMPQ AX, R8
	JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm1Klarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(AX*1)
	MOVOU X3, -16(CX)(AX*1)
	MOVQ BX, CX

emit_remainder_end_encodeBlockAsm1K:
	// Return CX - dst_base, the number of bytes written to dst.
	MOVQ dst_base+0(FP), AX
	SUBQ AX, CX
	MOVQ CX, ret+56(FP)
	RET

// func encodeBetterBlockAsm(dst []byte, src []byte, tmp *[589824]byte) int // Requires: BMI, CMOV, SSE2 TEXT ·encodeBetterBlockAsm(SB), $24-64 MOVQ tmp+48(FP), AX MOVQ dst_base+0(FP), CX MOVQ $0x00001200, DX PXOR X0, X0 zero_loop_encodeBetterBlockAsm: MOVOU X0, (AX) MOVOU X0, 16(AX) MOVOU X0, 32(AX) MOVOU X0, 48(AX) MOVOU X0, 64(AX) MOVOU X0, 80(AX) MOVOU X0, 96(AX) MOVOU X0, 112(AX) ADDQ $0x80, AX DECQ DX JNZ zero_loop_encodeBetterBlockAsm MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), AX LEAQ -17(AX), DX LEAQ -17(AX), DI MOVL DI, 8(SP) SHRQ $0x05, AX SUBL AX, DX LEAQ (CX)(DX*1), DX MOVQ DX, (SP) MOVL $0x00000001, AX MOVL AX, 16(SP) MOVQ src_base+24(FP), DX search_loop_encodeBetterBlockAsm: MOVQ tmp+48(FP), DI MOVL AX, R8 SUBL 12(SP), R8 SHRL $0x08, R8 CMPL R8, $0x63 JBE check_maxskip_ok_encodeBetterBlockAsm LEAL 100(AX), R8 JMP check_maxskip_cont_encodeBetterBlockAsm check_maxskip_ok_encodeBetterBlockAsm: LEAL 1(AX)(R8*1), R8 check_maxskip_cont_encodeBetterBlockAsm: CMPL R8, 8(SP) JAE emit_remainder_encodeBetterBlockAsm MOVQ (DX)(AX*1), R9 MOVL R8, 20(SP) MOVQ $0x00cf1bbcdcbfa563, R11 MOVQ $0x9e3779b1, R8 MOVQ R9, R12 MOVQ R9, R13 SHLQ $0x08, R12 IMULQ R11, R12 SHRQ $0x2f, R12 SHLQ $0x20, R13 IMULQ R8, R13 SHRQ $0x32, R13 MOVL (DI)(R12*4), R8 MOVL 524288(DI)(R13*4), R10 MOVL AX, (DI)(R12*4) MOVL AX, 524288(DI)(R13*4) LEAL -2162685(AX), R12 CMPL R8, R12 JLE offset_ok_0_encodeBetterBlockAsm MOVQ (DX)(R8*1), BX CMPQ BX, R9 JEQ candidate_match_encodeBetterBlockAsm offset_ok_0_encodeBetterBlockAsm: CMPL R10, R12 JLE offset_ok_1_encodeBetterBlockAsm MOVQ (DX)(R10*1), SI CMPQ SI, R9 offset_ok_1_encodeBetterBlockAsm: MOVL AX, R13 SUBL 16(SP), R13 MOVQ (DX)(R13*1), R13 MOVQ $0x000000ffffffff00, R14 XORQ R9, 
R13 TESTQ R14, R13 JNE no_repeat_found_encodeBetterBlockAsm LEAL 1(AX), DI MOVL 12(SP), R8 MOVL DI, R9 SUBL 16(SP), R9 JZ repeat_extend_back_end_encodeBetterBlockAsm repeat_extend_back_loop_encodeBetterBlockAsm: CMPL DI, R8 JBE repeat_extend_back_end_encodeBetterBlockAsm MOVB -1(DX)(R9*1), R10 MOVB -1(DX)(DI*1), R11 CMPB R10, R11 JNE repeat_extend_back_end_encodeBetterBlockAsm LEAL -1(DI), DI DECL R9 JNZ repeat_extend_back_loop_encodeBetterBlockAsm repeat_extend_back_end_encodeBetterBlockAsm: MOVL DI, R8 SUBL 12(SP), R8 LEAQ 4(CX)(R8*1), R8 CMPQ R8, (SP) JB repeat_dst_size_check_encodeBetterBlockAsm MOVQ $0x00000000, ret+56(FP) RET repeat_dst_size_check_encodeBetterBlockAsm: // emitLiteralsDstP MOVL 12(SP), R8 CMPL R8, DI JEQ emit_literal_done_repeat_emit_encodeBetterBlockAsm MOVL DI, R9 MOVL DI, 12(SP) LEAQ (DX)(R8*1), R10 SUBL R8, R9 // emitLiteral LEAL -1(R9), R8 CMPL R8, $0x1d JB one_byte_repeat_emit_encodeBetterBlockAsm SUBL $0x1d, R8 CMPL R8, $0x00000100 JB two_bytes_repeat_emit_encodeBetterBlockAsm CMPL R8, $0x00010000 JB three_bytes_repeat_emit_encodeBetterBlockAsm MOVL R8, R11 SHRL $0x10, R11 MOVB $0xf8, (CX) MOVW R8, 1(CX) MOVB R11, 3(CX) ADDQ $0x04, CX ADDL $0x1d, R8 JMP memmove_long_repeat_emit_encodeBetterBlockAsm three_bytes_repeat_emit_encodeBetterBlockAsm: MOVB $0xf0, (CX) MOVW R8, 1(CX) ADDQ $0x03, CX ADDL $0x1d, R8 JMP memmove_long_repeat_emit_encodeBetterBlockAsm two_bytes_repeat_emit_encodeBetterBlockAsm: MOVB $0xe8, (CX) MOVB R8, 1(CX) ADDL $0x1d, R8 ADDQ $0x02, CX CMPL R8, $0x40 JB memmove_midrepeat_emit_encodeBetterBlockAsm JMP memmove_long_repeat_emit_encodeBetterBlockAsm one_byte_repeat_emit_encodeBetterBlockAsm: SHLB $0x03, R8 MOVB R8, (CX) ADDQ $0x01, CX LEAQ (CX)(R9*1), R8 // genMemMoveShort // margin: 16, min move: 1 CMPQ R9, $0x10 JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_8through16 CMPQ R9, $0x20 JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_17through32 JMP 
emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_33through64 emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_8through16: MOVOU (R10), X0 MOVOU X0, (CX) JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_17through32: MOVOU (R10), X0 MOVOU -16(R10)(R9*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(R9*1) JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_33through64: MOVOU (R10), X0 MOVOU 16(R10), X1 MOVOU -32(R10)(R9*1), X2 MOVOU -16(R10)(R9*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R9*1) MOVOU X3, -16(CX)(R9*1) memmove_end_copy_repeat_emit_encodeBetterBlockAsm: MOVQ R8, CX JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm memmove_midrepeat_emit_encodeBetterBlockAsm: LEAQ (CX)(R9*1), R8 // genMemMoveShort // margin: 15, min move: 30 CMPQ R9, $0x20 JBE emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm_memmove_move_17through32 JMP emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm_memmove_move_33through64 emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm_memmove_move_17through32: MOVOU (R10), X0 MOVOU -16(R10)(R9*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(R9*1) JMP memmove_mid_end_copy_repeat_emit_encodeBetterBlockAsm emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm_memmove_move_33through64: MOVOU (R10), X0 MOVOU 16(R10), X1 MOVOU -32(R10)(R9*1), X2 MOVOU -16(R10)(R9*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R9*1) MOVOU X3, -16(CX)(R9*1) memmove_mid_end_copy_repeat_emit_encodeBetterBlockAsm: MOVQ R8, CX JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm memmove_long_repeat_emit_encodeBetterBlockAsm: LEAQ (CX)(R9*1), R8 // genMemMoveLong MOVOU (R10), X0 MOVOU 16(R10), X1 MOVOU -32(R10)(R9*1), X2 MOVOU -16(R10)(R9*1), X3 MOVQ R9, R12 SHRQ $0x05, R12 MOVQ CX, R11 ANDL $0x0000001f, R11 MOVQ $0x00000040, R13 SUBQ R11, R13 DECQ R12 JA 
emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 LEAQ -32(R10)(R13*1), R11 LEAQ -32(CX)(R13*1), R14 emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_big_loop_back: MOVOU (R11), X4 MOVOU 16(R11), X5 MOVOA X4, (R14) MOVOA X5, 16(R14) ADDQ $0x20, R14 ADDQ $0x20, R11 ADDQ $0x20, R13 DECQ R12 JNA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_big_loop_back emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32: MOVOU -32(R10)(R13*1), X4 MOVOU -16(R10)(R13*1), X5 MOVOA X4, -32(CX)(R13*1) MOVOA X5, -16(CX)(R13*1) ADDQ $0x20, R13 CMPQ R9, R13 JAE emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R9*1) MOVOU X3, -16(CX)(R9*1) MOVQ R8, CX emit_literal_done_repeat_emit_encodeBetterBlockAsm: ADDL $0x05, AX MOVL AX, R8 SUBL 16(SP), R8 MOVQ src_len+32(FP), R9 SUBL AX, R9 LEAQ (DX)(AX*1), R10 LEAQ (DX)(R8*1), R8 // matchLen XORL R12, R12 JMP matchlen_loop_16_entry_repeat_extend_encodeBetterBlockAsm matchlen_loopback_16_repeat_extend_encodeBetterBlockAsm: MOVQ (R10)(R12*1), R11 MOVQ 8(R10)(R12*1), R13 XORQ (R8)(R12*1), R11 JNZ matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm XORQ 8(R8)(R12*1), R13 JNZ matchlen_bsf_16repeat_extend_encodeBetterBlockAsm LEAL -16(R9), R9 LEAL 16(R12), R12 matchlen_loop_16_entry_repeat_extend_encodeBetterBlockAsm: CMPL R9, $0x10 JAE matchlen_loopback_16_repeat_extend_encodeBetterBlockAsm JMP matchlen_match8_repeat_extend_encodeBetterBlockAsm matchlen_bsf_16repeat_extend_encodeBetterBlockAsm: #ifdef GOAMD64_v3 TZCNTQ R13, R13 #else BSFQ R13, R13 #endif SARQ $0x03, R13 LEAL 8(R12)(R13*1), R12 JMP repeat_extend_forward_end_encodeBetterBlockAsm matchlen_match8_repeat_extend_encodeBetterBlockAsm: CMPL R9, $0x08 JB matchlen_match4_repeat_extend_encodeBetterBlockAsm MOVQ (R10)(R12*1), R11 XORQ (R8)(R12*1), R11 JNZ matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm LEAL -8(R9), R9 LEAL 8(R12), R12 JMP 
matchlen_match4_repeat_extend_encodeBetterBlockAsm matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm: #ifdef GOAMD64_v3 TZCNTQ R11, R11 #else BSFQ R11, R11 #endif SARQ $0x03, R11 LEAL (R12)(R11*1), R12 JMP repeat_extend_forward_end_encodeBetterBlockAsm matchlen_match4_repeat_extend_encodeBetterBlockAsm: CMPL R9, $0x04 JB matchlen_match2_repeat_extend_encodeBetterBlockAsm MOVL (R10)(R12*1), R11 CMPL (R8)(R12*1), R11 JNE matchlen_match2_repeat_extend_encodeBetterBlockAsm LEAL -4(R9), R9 LEAL 4(R12), R12 matchlen_match2_repeat_extend_encodeBetterBlockAsm: CMPL R9, $0x01 JE matchlen_match1_repeat_extend_encodeBetterBlockAsm JB repeat_extend_forward_end_encodeBetterBlockAsm MOVW (R10)(R12*1), R11 CMPW (R8)(R12*1), R11 JNE matchlen_match1_repeat_extend_encodeBetterBlockAsm LEAL 2(R12), R12 SUBL $0x02, R9 JZ repeat_extend_forward_end_encodeBetterBlockAsm matchlen_match1_repeat_extend_encodeBetterBlockAsm: MOVB (R10)(R12*1), R11 CMPB (R8)(R12*1), R11 JNE repeat_extend_forward_end_encodeBetterBlockAsm LEAL 1(R12), R12 repeat_extend_forward_end_encodeBetterBlockAsm: ADDL R12, AX MOVL AX, R8 SUBL DI, R8 MOVL 16(SP), DI // emitRepeat LEAL -1(R8), DI CMPL R8, $0x1d JBE repeat_one_match_repeat_encodeBetterBlockAsm LEAL -30(R8), DI CMPL R8, $0x0000011e JB repeat_two_match_repeat_encodeBetterBlockAsm CMPL R8, $0x0001001e JB repeat_three_match_repeat_encodeBetterBlockAsm MOVB $0xfc, (CX) MOVL DI, 1(CX) ADDQ $0x04, CX JMP repeat_end_emit_encodeBetterBlockAsm repeat_three_match_repeat_encodeBetterBlockAsm: MOVB $0xf4, (CX) MOVW DI, 1(CX) ADDQ $0x03, CX JMP repeat_end_emit_encodeBetterBlockAsm repeat_two_match_repeat_encodeBetterBlockAsm: MOVB $0xec, (CX) MOVB DI, 1(CX) ADDQ $0x02, CX JMP repeat_end_emit_encodeBetterBlockAsm repeat_one_match_repeat_encodeBetterBlockAsm: XORL DI, DI LEAL -4(DI)(R8*8), DI MOVB DI, (CX) ADDQ $0x01, CX repeat_end_emit_encodeBetterBlockAsm: MOVL AX, 12(SP) JMP search_loop_encodeBetterBlockAsm no_repeat_found_encodeBetterBlockAsm: CMPL R8, R12 JLE 
offset_ok_2_encodeBetterBlockAsm CMPL BX, R9 JEQ candidate_match_encodeBetterBlockAsm offset_ok_2_encodeBetterBlockAsm: CMPL R10, R12 JLE offset_ok_3_encodeBetterBlockAsm CMPL SI, R9 JEQ candidateS_match_encodeBetterBlockAsm offset_ok_3_encodeBetterBlockAsm: MOVL 20(SP), AX JMP search_loop_encodeBetterBlockAsm candidateS_match_encodeBetterBlockAsm: SHRQ $0x08, R9 MOVQ R9, R13 SHLQ $0x08, R13 IMULQ R11, R13 SHRQ $0x2f, R13 MOVL (DI)(R13*4), R8 INCL AX MOVL AX, (DI)(R13*4) CMPL R8, R12 JLE offset_ok_4_encodeBetterBlockAsm CMPL (DX)(R8*1), R9 JEQ candidate_match_encodeBetterBlockAsm offset_ok_4_encodeBetterBlockAsm: DECL AX MOVL R10, R8 candidate_match_encodeBetterBlockAsm: MOVL 12(SP), DI TESTL R8, R8 JZ match_extend_back_end_encodeBetterBlockAsm match_extend_back_loop_encodeBetterBlockAsm: CMPL AX, DI JBE match_extend_back_end_encodeBetterBlockAsm MOVB -1(DX)(R8*1), R9 MOVB -1(DX)(AX*1), R10 CMPB R9, R10 JNE match_extend_back_end_encodeBetterBlockAsm LEAL -1(AX), AX DECL R8 JZ match_extend_back_end_encodeBetterBlockAsm JMP match_extend_back_loop_encodeBetterBlockAsm match_extend_back_end_encodeBetterBlockAsm: MOVL AX, DI SUBL 12(SP), DI LEAQ 4(CX)(DI*1), DI CMPQ DI, (SP) JB match_dst_size_check_encodeBetterBlockAsm MOVQ $0x00000000, ret+56(FP) RET match_dst_size_check_encodeBetterBlockAsm: MOVL AX, DI ADDL $0x04, AX ADDL $0x04, R8 MOVQ src_len+32(FP), R9 SUBL AX, R9 LEAQ (DX)(AX*1), R10 LEAQ (DX)(R8*1), R11 // matchLen XORL R13, R13 JMP matchlen_loop_16_entry_match_nolit_encodeBetterBlockAsm matchlen_loopback_16_match_nolit_encodeBetterBlockAsm: MOVQ (R10)(R13*1), R12 MOVQ 8(R10)(R13*1), R14 XORQ (R11)(R13*1), R12 JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm XORQ 8(R11)(R13*1), R14 JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm LEAL -16(R9), R9 LEAL 16(R13), R13 matchlen_loop_16_entry_match_nolit_encodeBetterBlockAsm: CMPL R9, $0x10 JAE matchlen_loopback_16_match_nolit_encodeBetterBlockAsm JMP matchlen_match8_match_nolit_encodeBetterBlockAsm 
matchlen_bsf_16match_nolit_encodeBetterBlockAsm: #ifdef GOAMD64_v3 TZCNTQ R14, R14 #else BSFQ R14, R14 #endif SARQ $0x03, R14 LEAL 8(R13)(R14*1), R13 JMP match_nolit_end_encodeBetterBlockAsm matchlen_match8_match_nolit_encodeBetterBlockAsm: CMPL R9, $0x08 JB matchlen_match4_match_nolit_encodeBetterBlockAsm MOVQ (R10)(R13*1), R12 XORQ (R11)(R13*1), R12 JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm LEAL -8(R9), R9 LEAL 8(R13), R13 JMP matchlen_match4_match_nolit_encodeBetterBlockAsm matchlen_bsf_8_match_nolit_encodeBetterBlockAsm: #ifdef GOAMD64_v3 TZCNTQ R12, R12 #else BSFQ R12, R12 #endif SARQ $0x03, R12 LEAL (R13)(R12*1), R13 JMP match_nolit_end_encodeBetterBlockAsm matchlen_match4_match_nolit_encodeBetterBlockAsm: CMPL R9, $0x04 JB matchlen_match2_match_nolit_encodeBetterBlockAsm MOVL (R10)(R13*1), R12 CMPL (R11)(R13*1), R12 JNE matchlen_match2_match_nolit_encodeBetterBlockAsm LEAL -4(R9), R9 LEAL 4(R13), R13 matchlen_match2_match_nolit_encodeBetterBlockAsm: CMPL R9, $0x01 JE matchlen_match1_match_nolit_encodeBetterBlockAsm JB match_nolit_end_encodeBetterBlockAsm MOVW (R10)(R13*1), R12 CMPW (R11)(R13*1), R12 JNE matchlen_match1_match_nolit_encodeBetterBlockAsm LEAL 2(R13), R13 SUBL $0x02, R9 JZ match_nolit_end_encodeBetterBlockAsm matchlen_match1_match_nolit_encodeBetterBlockAsm: MOVB (R10)(R13*1), R12 CMPB (R11)(R13*1), R12 JNE match_nolit_end_encodeBetterBlockAsm LEAL 1(R13), R13 match_nolit_end_encodeBetterBlockAsm: MOVL AX, R9 SUBL R8, R9 CMPL R13, $0x01 JA match_length_ok_encodeBetterBlockAsm CMPL R9, $0x0001003f JBE match_length_ok_encodeBetterBlockAsm MOVL 20(SP), AX INCL AX JMP search_loop_encodeBetterBlockAsm match_length_ok_encodeBetterBlockAsm: MOVL R9, 16(SP) // Check if we can combine lit+copy MOVLQZX 12(SP), R10 MOVL DI, R8 SUBL R10, R8 JZ match_emit_nolits_encodeBetterBlockAsm CMPL R9, $0x00000040 JL match_emit_lits_encodeBetterBlockAsm CMPL R9, $0x0001003f JA match_emit_copy3_encodeBetterBlockAsm CMPL R8, $0x04 JA 
match_emit_lits_encodeBetterBlockAsm MOVL (DX)(R10*1), R10 ADDL R13, AX ADDL $0x04, R13 MOVL AX, 12(SP) // emitCopy2WithLits XORQ R11, R11 SUBL $0x40, R9 LEAL -11(R13), R12 LEAL -4(R13), R13 MOVW R9, 1(CX) CMPL R13, $0x07 CMOVLGE R12, R11 MOVQ $0x00000007, R9 CMOVLLT R13, R9 LEAL -1(R8)(R9*4), R9 MOVL $0x00000003, R12 LEAL (R12)(R9*8), R9 MOVB R9, (CX) ADDQ $0x03, CX MOVL R10, (CX) ADDQ R8, CX TESTL R11, R11 JZ match_nolit_emitcopy_end_encodeBetterBlockAsm // emitRepeat LEAL -1(R11), R8 CMPL R11, $0x1d JBE repeat_one_match_emit_repeat_copy2_encodeBetterBlockAsm LEAL -30(R11), R8 CMPL R11, $0x0000011e JB repeat_two_match_emit_repeat_copy2_encodeBetterBlockAsm CMPL R11, $0x0001001e JB repeat_three_match_emit_repeat_copy2_encodeBetterBlockAsm MOVB $0xfc, (CX) MOVL R8, 1(CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_three_match_emit_repeat_copy2_encodeBetterBlockAsm: MOVB $0xf4, (CX) MOVW R8, 1(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_two_match_emit_repeat_copy2_encodeBetterBlockAsm: MOVB $0xec, (CX) MOVB R8, 1(CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_one_match_emit_repeat_copy2_encodeBetterBlockAsm: XORL R8, R8 LEAL -4(R8)(R11*8), R8 MOVB R8, (CX) ADDQ $0x01, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm match_emit_copy3_encodeBetterBlockAsm: CMPL R8, $0x03 JA match_emit_lits_encodeBetterBlockAsm MOVLQZX 12(SP), R10 MOVL (DX)(R10*1), R10 ADDL R13, AX ADDL $0x04, R13 MOVL AX, 12(SP) // emitCopy3 LEAL -4(R13), R13 LEAL -65536(R9), R9 SHLL $0x0b, R9 LEAL 7(R9)(R8*8), R9 CMPL R13, $0x3c JBE emit_copy3_0_match_emit_lits_encodeBetterBlockAsm LEAL -60(R13), R11 CMPL R13, $0x0000013c JB emit_copy3_1_match_emit_lits_encodeBetterBlockAsm CMPL R13, $0x0001003c JB emit_copy3_2_match_emit_lits_encodeBetterBlockAsm ADDL $0x000007e0, R9 MOVL R9, (CX) MOVL R11, 4(CX) ADDQ $0x07, CX JMP match_emit_copy_litsencodeBetterBlockAsm emit_copy3_2_match_emit_lits_encodeBetterBlockAsm: 
ADDL $0x000007c0, R9 MOVL R9, (CX) MOVW R11, 4(CX) ADDQ $0x06, CX JMP match_emit_copy_litsencodeBetterBlockAsm emit_copy3_1_match_emit_lits_encodeBetterBlockAsm: ADDL $0x000007a0, R9 MOVL R9, (CX) MOVB R11, 4(CX) ADDQ $0x05, CX JMP match_emit_copy_litsencodeBetterBlockAsm emit_copy3_0_match_emit_lits_encodeBetterBlockAsm: SHLL $0x05, R13 ORL R13, R9 MOVL R9, (CX) ADDQ $0x04, CX match_emit_copy_litsencodeBetterBlockAsm: MOVL R10, (CX) ADDQ R8, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm match_emit_lits_encodeBetterBlockAsm: LEAQ (DX)(R10*1), R10 // emitLiteral LEAL -1(R8), R11 CMPL R11, $0x1d JB one_byte_match_emit_encodeBetterBlockAsm SUBL $0x1d, R11 CMPL R11, $0x00000100 JB two_bytes_match_emit_encodeBetterBlockAsm CMPL R11, $0x00010000 JB three_bytes_match_emit_encodeBetterBlockAsm MOVL R11, R12 SHRL $0x10, R12 MOVB $0xf8, (CX) MOVW R11, 1(CX) MOVB R12, 3(CX) ADDQ $0x04, CX ADDL $0x1d, R11 JMP memmove_long_match_emit_encodeBetterBlockAsm three_bytes_match_emit_encodeBetterBlockAsm: MOVB $0xf0, (CX) MOVW R11, 1(CX) ADDQ $0x03, CX ADDL $0x1d, R11 JMP memmove_long_match_emit_encodeBetterBlockAsm two_bytes_match_emit_encodeBetterBlockAsm: MOVB $0xe8, (CX) MOVB R11, 1(CX) ADDL $0x1d, R11 ADDQ $0x02, CX CMPL R11, $0x40 JB memmove_midmatch_emit_encodeBetterBlockAsm JMP memmove_long_match_emit_encodeBetterBlockAsm one_byte_match_emit_encodeBetterBlockAsm: SHLB $0x03, R11 MOVB R11, (CX) ADDQ $0x01, CX LEAQ (CX)(R8*1), R11 // genMemMoveShort // margin: 16, min move: 1 CMPQ R8, $0x10 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16 CMPQ R8, $0x20 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64 emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16: MOVOU (R10), X0 MOVOU X0, (CX) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32: MOVOU 
(R10), X0 MOVOU -16(R10)(R8*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(R8*1) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64: MOVOU (R10), X0 MOVOU 16(R10), X1 MOVOU -32(R10)(R8*1), X2 MOVOU -16(R10)(R8*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R8*1) MOVOU X3, -16(CX)(R8*1) memmove_end_copy_match_emit_encodeBetterBlockAsm: MOVQ R11, CX JMP match_emit_nolits_encodeBetterBlockAsm memmove_midmatch_emit_encodeBetterBlockAsm: LEAQ (CX)(R8*1), R11 // genMemMoveShort // margin: 15, min move: 30 CMPQ R8, $0x20 JBE emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm_memmove_move_17through32 JMP emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm_memmove_move_33through64 emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm_memmove_move_17through32: MOVOU (R10), X0 MOVOU -16(R10)(R8*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(R8*1) JMP memmove_mid_end_copy_match_emit_encodeBetterBlockAsm emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm_memmove_move_33through64: MOVOU (R10), X0 MOVOU 16(R10), X1 MOVOU -32(R10)(R8*1), X2 MOVOU -16(R10)(R8*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R8*1) MOVOU X3, -16(CX)(R8*1) memmove_mid_end_copy_match_emit_encodeBetterBlockAsm: MOVQ R11, CX JMP match_emit_nolits_encodeBetterBlockAsm memmove_long_match_emit_encodeBetterBlockAsm: LEAQ (CX)(R8*1), R11 // genMemMoveLong MOVOU (R10), X0 MOVOU 16(R10), X1 MOVOU -32(R10)(R8*1), X2 MOVOU -16(R10)(R8*1), X3 MOVQ R8, R14 SHRQ $0x05, R14 MOVQ CX, R12 ANDL $0x0000001f, R12 MOVQ $0x00000040, R15 SUBQ R12, R15 DECQ R14 JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 LEAQ -32(R10)(R15*1), R12 LEAQ -32(CX)(R15*1), BP emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back: MOVOU (R12), X4 MOVOU 16(R12), X5 MOVOA X4, (BP) MOVOA X5, 16(BP) ADDQ $0x20, BP ADDQ $0x20, R12 ADDQ $0x20, R15 DECQ R14 JNA 
emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32: MOVOU -32(R10)(R15*1), X4 MOVOU -16(R10)(R15*1), X5 MOVOA X4, -32(CX)(R15*1) MOVOA X5, -16(CX)(R15*1) ADDQ $0x20, R15 CMPQ R8, R15 JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R8*1) MOVOU X3, -16(CX)(R8*1) MOVQ R11, CX match_emit_nolits_encodeBetterBlockAsm: ADDL R13, AX ADDL $0x04, R13 MOVL AX, 12(SP) // emitCopy CMPL R9, $0x0001003f JBE two_byte_offset_match_nolit_encodeBetterBlockAsm // emitCopy3 LEAL -4(R13), R13 LEAL -65536(R9), R8 SHLL $0x0b, R8 ADDL $0x07, R8 CMPL R13, $0x3c JBE emit_copy3_0_match_nolit_encodeBetterBlockAsm_emit3 LEAL -60(R13), R9 CMPL R13, $0x0000013c JB emit_copy3_1_match_nolit_encodeBetterBlockAsm_emit3 CMPL R13, $0x0001003c JB emit_copy3_2_match_nolit_encodeBetterBlockAsm_emit3 ADDL $0x000007e0, R8 MOVL R8, (CX) MOVL R9, 4(CX) ADDQ $0x07, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm emit_copy3_2_match_nolit_encodeBetterBlockAsm_emit3: ADDL $0x000007c0, R8 MOVL R8, (CX) MOVW R9, 4(CX) ADDQ $0x06, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm emit_copy3_1_match_nolit_encodeBetterBlockAsm_emit3: ADDL $0x000007a0, R8 MOVL R8, (CX) MOVB R9, 4(CX) ADDQ $0x05, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm emit_copy3_0_match_nolit_encodeBetterBlockAsm_emit3: SHLL $0x05, R13 ORL R13, R8 MOVL R8, (CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm two_byte_offset_match_nolit_encodeBetterBlockAsm: CMPL R9, $0x00000400 JA two_byte_match_nolit_encodeBetterBlockAsm CMPL R13, $0x00000013 JAE emit_one_longer_match_nolit_encodeBetterBlockAsm LEAL -1(R9), R8 SHLL $0x06, R8 LEAL -15(R8)(R13*4), R8 MOVW R8, (CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm emit_one_longer_match_nolit_encodeBetterBlockAsm: CMPL R13, $0x00000112 JAE 
emit_copy1_repeat_match_nolit_encodeBetterBlockAsm LEAL -1(R9), R8 SHLL $0x06, R8 LEAL 61(R8), R8 MOVW R8, (CX) LEAL -18(R13), R8 MOVB R8, 2(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm emit_copy1_repeat_match_nolit_encodeBetterBlockAsm: LEAL -1(R9), R8 SHLL $0x06, R8 LEAL 57(R8), R8 MOVW R8, (CX) ADDQ $0x02, CX SUBL $0x12, R13 // emitRepeat LEAL -1(R13), R8 CMPL R13, $0x1d JBE repeat_one_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm LEAL -30(R13), R8 CMPL R13, $0x0000011e JB repeat_two_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm CMPL R13, $0x0001001e JB repeat_three_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm MOVB $0xfc, (CX) MOVL R8, 1(CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_three_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm: MOVB $0xf4, (CX) MOVW R8, 1(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_two_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm: MOVB $0xec, (CX) MOVB R8, 1(CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_one_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm: XORL R8, R8 LEAL -4(R8)(R13*8), R8 MOVB R8, (CX) ADDQ $0x01, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm two_byte_match_nolit_encodeBetterBlockAsm: // emitCopy2 LEAL -64(R9), R9 LEAL -4(R13), R13 MOVW R9, 1(CX) CMPL R13, $0x3c JBE emit_copy2_0_match_nolit_encodeBetterBlockAsm_emit2 LEAL -60(R13), R8 CMPL R13, $0x0000013c JB emit_copy2_1_match_nolit_encodeBetterBlockAsm_emit2 CMPL R13, $0x0001003c JB emit_copy2_2_match_nolit_encodeBetterBlockAsm_emit2 MOVB $0xfe, (CX) MOVL R8, 3(CX) ADDQ $0x06, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm emit_copy2_2_match_nolit_encodeBetterBlockAsm_emit2: MOVB $0xfa, (CX) MOVW R8, 3(CX) ADDQ $0x05, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm emit_copy2_1_match_nolit_encodeBetterBlockAsm_emit2: MOVB $0xf6, (CX) MOVB R8, 3(CX) ADDQ $0x04, CX JMP 
match_nolit_emitcopy_end_encodeBetterBlockAsm emit_copy2_0_match_nolit_encodeBetterBlockAsm_emit2: MOVL $0x00000002, R8 LEAL (R8)(R13*4), R8 MOVB R8, (CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm // emitLiteralsDstP MOVL 12(SP), R8 CMPL R8, DI JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm MOVL DI, R9 MOVL DI, 12(SP) LEAQ (DX)(R8*1), R10 SUBL R8, R9 // emitLiteral LEAL -1(R9), R8 CMPL R8, $0x1d JB one_byte_match_emit_repeat_encodeBetterBlockAsm SUBL $0x1d, R8 CMPL R8, $0x00000100 JB two_bytes_match_emit_repeat_encodeBetterBlockAsm CMPL R8, $0x00010000 JB three_bytes_match_emit_repeat_encodeBetterBlockAsm MOVL R8, R11 SHRL $0x10, R11 MOVB $0xf8, (CX) MOVW R8, 1(CX) MOVB R11, 3(CX) ADDQ $0x04, CX ADDL $0x1d, R8 JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm three_bytes_match_emit_repeat_encodeBetterBlockAsm: MOVB $0xf0, (CX) MOVW R8, 1(CX) ADDQ $0x03, CX ADDL $0x1d, R8 JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm two_bytes_match_emit_repeat_encodeBetterBlockAsm: MOVB $0xe8, (CX) MOVB R8, 1(CX) ADDL $0x1d, R8 ADDQ $0x02, CX CMPL R8, $0x40 JB memmove_midmatch_emit_repeat_encodeBetterBlockAsm JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm one_byte_match_emit_repeat_encodeBetterBlockAsm: SHLB $0x03, R8 MOVB R8, (CX) ADDQ $0x01, CX LEAQ (CX)(R9*1), R8 // genMemMoveShort // margin: 16, min move: 1 CMPQ R9, $0x10 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16 CMPQ R9, $0x20 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32 JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16: MOVOU (R10), X0 MOVOU X0, (CX) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32: MOVOU (R10), X0 MOVOU -16(R10)(R9*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(R9*1) JMP 
memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64: MOVOU (R10), X0 MOVOU 16(R10), X1 MOVOU -32(R10)(R9*1), X2 MOVOU -16(R10)(R9*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R9*1) MOVOU X3, -16(CX)(R9*1) memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm: MOVQ R8, CX JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm memmove_midmatch_emit_repeat_encodeBetterBlockAsm: LEAQ (CX)(R9*1), R8 // genMemMoveShort // margin: 15, min move: 30 CMPQ R9, $0x20 JBE emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32 JMP emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64 emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32: MOVOU (R10), X0 MOVOU -16(R10)(R9*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(R9*1) JMP memmove_mid_end_copy_match_emit_repeat_encodeBetterBlockAsm emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64: MOVOU (R10), X0 MOVOU 16(R10), X1 MOVOU -32(R10)(R9*1), X2 MOVOU -16(R10)(R9*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R9*1) MOVOU X3, -16(CX)(R9*1) memmove_mid_end_copy_match_emit_repeat_encodeBetterBlockAsm: MOVQ R8, CX JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm memmove_long_match_emit_repeat_encodeBetterBlockAsm: LEAQ (CX)(R9*1), R8 // genMemMoveLong MOVOU (R10), X0 MOVOU 16(R10), X1 MOVOU -32(R10)(R9*1), X2 MOVOU -16(R10)(R9*1), X3 MOVQ R9, R12 SHRQ $0x05, R12 MOVQ CX, R11 ANDL $0x0000001f, R11 MOVQ $0x00000040, R14 SUBQ R11, R14 DECQ R12 JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32 LEAQ -32(R10)(R14*1), R11 LEAQ -32(CX)(R14*1), R15 emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back: MOVOU (R11), X4 MOVOU 16(R11), X5 MOVOA X4, (R15) MOVOA X5, 16(R15) ADDQ $0x20, R15 ADDQ $0x20, R11 ADDQ $0x20, R14 DECQ R12 JNA 
emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32: MOVOU -32(R10)(R14*1), X4 MOVOU -16(R10)(R14*1), X5 MOVOA X4, -32(CX)(R14*1) MOVOA X5, -16(CX)(R14*1) ADDQ $0x20, R14 CMPQ R9, R14 JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R9*1) MOVOU X3, -16(CX)(R9*1) MOVQ R8, CX emit_literal_done_match_emit_repeat_encodeBetterBlockAsm: ADDL R13, AX ADDL $0x04, R13 MOVL AX, 12(SP) // emitRepeat LEAL -1(R13), R8 CMPL R13, $0x1d JBE repeat_one_match_nolit_repeat_encodeBetterBlockAsm LEAL -30(R13), R8 CMPL R13, $0x0000011e JB repeat_two_match_nolit_repeat_encodeBetterBlockAsm CMPL R13, $0x0001001e JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm MOVB $0xfc, (CX) MOVL R8, 1(CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_three_match_nolit_repeat_encodeBetterBlockAsm: MOVB $0xf4, (CX) MOVW R8, 1(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_two_match_nolit_repeat_encodeBetterBlockAsm: MOVB $0xec, (CX) MOVB R8, 1(CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_one_match_nolit_repeat_encodeBetterBlockAsm: XORL R8, R8 LEAL -4(R8)(R13*8), R8 MOVB R8, (CX) ADDQ $0x01, CX match_nolit_emitcopy_end_encodeBetterBlockAsm: CMPL AX, 8(SP) JAE emit_remainder_encodeBetterBlockAsm CMPQ CX, (SP) JB match_nolit_dst_ok_encodeBetterBlockAsm MOVQ $0x00000000, ret+56(FP) RET match_nolit_dst_ok_encodeBetterBlockAsm: MOVQ tmp+48(FP), R8 MOVQ $0x00cf1bbcdcbfa563, R9 MOVQ $0x9e3779b1, R10 LEAQ 1(DI), DI LEAQ -2(AX), R11 MOVQ (DX)(DI*1), R12 MOVQ 1(DX)(DI*1), R13 MOVQ (DX)(R11*1), R14 MOVQ 1(DX)(R11*1), R15 SHLQ $0x08, R12 IMULQ R9, R12 SHRQ $0x2f, R12 SHLQ $0x20, R13 IMULQ R10, R13 SHRQ $0x32, R13 SHLQ $0x08, R14 IMULQ R9, R14 SHRQ $0x2f, R14 SHLQ $0x20, R15 IMULQ R10, R15 SHRQ $0x32, R15 LEAQ 1(DI), R10 
LEAQ 1(R11), BP MOVL DI, (R8)(R12*4) MOVL R11, (R8)(R14*4) LEAQ 1(R11)(DI*1), R12 SHRQ $0x01, R12 ADDQ $0x01, DI SUBQ $0x01, R11 MOVL R10, 524288(R8)(R13*4) MOVL BP, 524288(R8)(R15*4) index_loop_encodeBetterBlockAsm: CMPQ R12, R11 JAE search_loop_encodeBetterBlockAsm MOVQ (DX)(DI*1), R10 MOVQ (DX)(R12*1), R13 SHLQ $0x08, R10 IMULQ R9, R10 SHRQ $0x2f, R10 SHLQ $0x08, R13 IMULQ R9, R13 SHRQ $0x2f, R13 MOVL DI, (R8)(R10*4) MOVL R11, (R8)(R13*4) ADDQ $0x02, DI ADDQ $0x02, R12 JMP index_loop_encodeBetterBlockAsm emit_remainder_encodeBetterBlockAsm: MOVQ src_len+32(FP), AX SUBL 12(SP), AX LEAQ 4(CX)(AX*1), AX CMPQ AX, (SP) JB emit_remainder_ok_encodeBetterBlockAsm MOVQ $0x00000000, ret+56(FP) RET emit_remainder_ok_encodeBetterBlockAsm: MOVQ src_len+32(FP), AX // emitLiteralsDstP MOVL 12(SP), BX CMPL BX, AX JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm MOVL AX, SI MOVL AX, 12(SP) LEAQ (DX)(BX*1), AX SUBL BX, SI // emitLiteral LEAL -1(SI), DX CMPL DX, $0x1d JB one_byte_emit_remainder_encodeBetterBlockAsm SUBL $0x1d, DX CMPL DX, $0x00000100 JB two_bytes_emit_remainder_encodeBetterBlockAsm CMPL DX, $0x00010000 JB three_bytes_emit_remainder_encodeBetterBlockAsm MOVL DX, BX SHRL $0x10, BX MOVB $0xf8, (CX) MOVW DX, 1(CX) MOVB BL, 3(CX) ADDQ $0x04, CX ADDL $0x1d, DX JMP memmove_long_emit_remainder_encodeBetterBlockAsm three_bytes_emit_remainder_encodeBetterBlockAsm: MOVB $0xf0, (CX) MOVW DX, 1(CX) ADDQ $0x03, CX ADDL $0x1d, DX JMP memmove_long_emit_remainder_encodeBetterBlockAsm two_bytes_emit_remainder_encodeBetterBlockAsm: MOVB $0xe8, (CX) MOVB DL, 1(CX) ADDL $0x1d, DX ADDQ $0x02, CX CMPL DX, $0x40 JB memmove_midemit_remainder_encodeBetterBlockAsm JMP memmove_long_emit_remainder_encodeBetterBlockAsm one_byte_emit_remainder_encodeBetterBlockAsm: SHLB $0x03, DL MOVB DL, (CX) ADDQ $0x01, CX LEAQ (CX)(SI*1), DX MOVL SI, BX // genMemMoveShort // margin: -1, min move: 1 CMPQ BX, $0x03 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2 JE 
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3 CMPQ BX, $0x08 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through8 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2: MOVB (AX), SI MOVB -1(AX)(BX*1), AL MOVB SI, (CX) MOVB AL, -1(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3: MOVW (AX), SI MOVB 2(AX), AL MOVW SI, (CX) MOVB AL, 2(CX) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through8: MOVL (AX), SI MOVL -4(AX)(BX*1), AX MOVL SI, (CX) MOVL AX, -4(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16: MOVQ (AX), SI MOVQ -8(AX)(BX*1), AX MOVQ SI, (CX) MOVQ AX, -8(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32: MOVOU (AX), X0 MOVOU -16(AX)(BX*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64: MOVOU (AX), X0 MOVOU 16(AX), X1 MOVOU -32(AX)(BX*1), X2 MOVOU -16(AX)(BX*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(BX*1) MOVOU X3, -16(CX)(BX*1) memmove_end_copy_emit_remainder_encodeBetterBlockAsm: MOVQ DX, CX JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm memmove_midemit_remainder_encodeBetterBlockAsm: LEAQ (CX)(SI*1), DX MOVL SI, BX // genMemMoveShort // margin: -2, min move: 30 CMPQ BX, $0x20 JBE 
emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32 JMP emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64 emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32: MOVOU (AX), X0 MOVOU -16(AX)(BX*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(BX*1) JMP memmove_mid_end_copy_emit_remainder_encodeBetterBlockAsm emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64: MOVOU (AX), X0 MOVOU 16(AX), X1 MOVOU -32(AX)(BX*1), X2 MOVOU -16(AX)(BX*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(BX*1) MOVOU X3, -16(CX)(BX*1) memmove_mid_end_copy_emit_remainder_encodeBetterBlockAsm: MOVQ DX, CX JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm memmove_long_emit_remainder_encodeBetterBlockAsm: LEAQ (CX)(SI*1), DX MOVL SI, BX // genMemMoveLong MOVOU (AX), X0 MOVOU 16(AX), X1 MOVOU -32(AX)(BX*1), X2 MOVOU -16(AX)(BX*1), X3 MOVQ BX, DI SHRQ $0x05, DI MOVQ CX, SI ANDL $0x0000001f, SI MOVQ $0x00000040, R8 SUBQ SI, R8 DECQ DI JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32 LEAQ -32(AX)(R8*1), SI LEAQ -32(CX)(R8*1), R9 emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back: MOVOU (SI), X4 MOVOU 16(SI), X5 MOVOA X4, (R9) MOVOA X5, 16(R9) ADDQ $0x20, R9 ADDQ $0x20, SI ADDQ $0x20, R8 DECQ DI JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32: MOVOU -32(AX)(R8*1), X4 MOVOU -16(AX)(R8*1), X5 MOVOA X4, -32(CX)(R8*1) MOVOA X5, -16(CX)(R8*1) ADDQ $0x20, R8 CMPQ BX, R8 JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(BX*1) MOVOU X3, -16(CX)(BX*1) MOVQ DX, CX emit_literal_done_emit_remainder_encodeBetterBlockAsm: MOVQ dst_base+0(FP), AX SUBQ AX, CX MOVQ CX, ret+56(FP) RET // func encodeBetterBlockAsm2MB(dst []byte, src 
[]byte, tmp *[589824]byte) int // Requires: BMI, CMOV, SSE2 TEXT ·encodeBetterBlockAsm2MB(SB), $24-64 MOVQ tmp+48(FP), AX MOVQ dst_base+0(FP), CX MOVQ $0x00001200, DX PXOR X0, X0 zero_loop_encodeBetterBlockAsm2MB: MOVOU X0, (AX) MOVOU X0, 16(AX) MOVOU X0, 32(AX) MOVOU X0, 48(AX) MOVOU X0, 64(AX) MOVOU X0, 80(AX) MOVOU X0, 96(AX) MOVOU X0, 112(AX) ADDQ $0x80, AX DECQ DX JNZ zero_loop_encodeBetterBlockAsm2MB MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), AX LEAQ -17(AX), DX LEAQ -17(AX), BX MOVL BX, 8(SP) SHRQ $0x05, AX SUBL AX, DX LEAQ (CX)(DX*1), DX MOVQ DX, (SP) MOVL $0x00000001, AX MOVL AX, 16(SP) MOVQ src_base+24(FP), DX search_loop_encodeBetterBlockAsm2MB: MOVQ tmp+48(FP), BX MOVL AX, SI SUBL 12(SP), SI SHRL $0x07, SI CMPL SI, $0x63 JBE check_maxskip_ok_encodeBetterBlockAsm2MB LEAL 100(AX), SI JMP check_maxskip_cont_encodeBetterBlockAsm2MB check_maxskip_ok_encodeBetterBlockAsm2MB: LEAL 1(AX)(SI*1), SI check_maxskip_cont_encodeBetterBlockAsm2MB: CMPL SI, 8(SP) JAE emit_remainder_encodeBetterBlockAsm2MB MOVQ (DX)(AX*1), DI MOVL SI, 20(SP) MOVQ $0x00cf1bbcdcbfa563, R9 MOVQ $0x9e3779b1, SI MOVQ DI, R10 MOVQ DI, R11 SHLQ $0x08, R10 IMULQ R9, R10 SHRQ $0x2f, R10 SHLQ $0x20, R11 IMULQ SI, R11 SHRQ $0x32, R11 MOVL (BX)(R10*4), SI MOVL 524288(BX)(R11*4), R8 MOVL AX, (BX)(R10*4) MOVL AX, 524288(BX)(R11*4) MOVQ (DX)(SI*1), R10 CMPQ R10, DI JEQ candidate_match_encodeBetterBlockAsm2MB MOVQ (DX)(R8*1), R11 CMPQ R11, DI MOVL AX, R12 SUBL 16(SP), R12 MOVQ (DX)(R12*1), R12 MOVQ $0x000000ffffffff00, R13 XORQ DI, R12 TESTQ R13, R12 JNE no_repeat_found_encodeBetterBlockAsm2MB LEAL 1(AX), BX MOVL 12(SP), SI MOVL BX, DI SUBL 16(SP), DI JZ repeat_extend_back_end_encodeBetterBlockAsm2MB repeat_extend_back_loop_encodeBetterBlockAsm2MB: CMPL BX, SI JBE repeat_extend_back_end_encodeBetterBlockAsm2MB MOVB -1(DX)(DI*1), R8 MOVB -1(DX)(BX*1), R9 CMPB R8, R9 JNE repeat_extend_back_end_encodeBetterBlockAsm2MB LEAL -1(BX), BX DECL DI JNZ repeat_extend_back_loop_encodeBetterBlockAsm2MB 
repeat_extend_back_end_encodeBetterBlockAsm2MB: MOVL BX, SI SUBL 12(SP), SI LEAQ 4(CX)(SI*1), SI CMPQ SI, (SP) JB repeat_dst_size_check_encodeBetterBlockAsm2MB MOVQ $0x00000000, ret+56(FP) RET repeat_dst_size_check_encodeBetterBlockAsm2MB: // emitLiteralsDstP MOVL 12(SP), SI CMPL SI, BX JEQ emit_literal_done_repeat_emit_encodeBetterBlockAsm2MB MOVL BX, DI MOVL BX, 12(SP) LEAQ (DX)(SI*1), R8 SUBL SI, DI // emitLiteral LEAL -1(DI), SI CMPL SI, $0x1d JB one_byte_repeat_emit_encodeBetterBlockAsm2MB SUBL $0x1d, SI CMPL SI, $0x00000100 JB two_bytes_repeat_emit_encodeBetterBlockAsm2MB CMPL SI, $0x00010000 JB three_bytes_repeat_emit_encodeBetterBlockAsm2MB MOVL SI, R9 SHRL $0x10, R9 MOVB $0xf8, (CX) MOVW SI, 1(CX) MOVB R9, 3(CX) ADDQ $0x04, CX ADDL $0x1d, SI JMP memmove_long_repeat_emit_encodeBetterBlockAsm2MB three_bytes_repeat_emit_encodeBetterBlockAsm2MB: MOVB $0xf0, (CX) MOVW SI, 1(CX) ADDQ $0x03, CX ADDL $0x1d, SI JMP memmove_long_repeat_emit_encodeBetterBlockAsm2MB two_bytes_repeat_emit_encodeBetterBlockAsm2MB: MOVB $0xe8, (CX) MOVB SI, 1(CX) ADDL $0x1d, SI ADDQ $0x02, CX CMPL SI, $0x40 JB memmove_midrepeat_emit_encodeBetterBlockAsm2MB JMP memmove_long_repeat_emit_encodeBetterBlockAsm2MB one_byte_repeat_emit_encodeBetterBlockAsm2MB: SHLB $0x03, SI MOVB SI, (CX) ADDQ $0x01, CX LEAQ (CX)(DI*1), SI // genMemMoveShort // margin: 16, min move: 1 CMPQ DI, $0x10 JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm2MB_memmove_move_8through16 CMPQ DI, $0x20 JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm2MB_memmove_move_17through32 JMP emit_lit_memmove_repeat_emit_encodeBetterBlockAsm2MB_memmove_move_33through64 emit_lit_memmove_repeat_emit_encodeBetterBlockAsm2MB_memmove_move_8through16: MOVOU (R8), X0 MOVOU X0, (CX) JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm2MB emit_lit_memmove_repeat_emit_encodeBetterBlockAsm2MB_memmove_move_17through32: MOVOU (R8), X0 MOVOU -16(R8)(DI*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(DI*1) JMP 
memmove_end_copy_repeat_emit_encodeBetterBlockAsm2MB emit_lit_memmove_repeat_emit_encodeBetterBlockAsm2MB_memmove_move_33through64: MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(DI*1), X2 MOVOU -16(R8)(DI*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(DI*1) MOVOU X3, -16(CX)(DI*1) memmove_end_copy_repeat_emit_encodeBetterBlockAsm2MB: MOVQ SI, CX JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm2MB memmove_midrepeat_emit_encodeBetterBlockAsm2MB: LEAQ (CX)(DI*1), SI // genMemMoveShort // margin: 15, min move: 30 CMPQ DI, $0x20 JBE emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm2MB_memmove_move_17through32 JMP emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm2MB_memmove_move_33through64 emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm2MB_memmove_move_17through32: MOVOU (R8), X0 MOVOU -16(R8)(DI*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(DI*1) JMP memmove_mid_end_copy_repeat_emit_encodeBetterBlockAsm2MB emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm2MB_memmove_move_33through64: MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(DI*1), X2 MOVOU -16(R8)(DI*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(DI*1) MOVOU X3, -16(CX)(DI*1) memmove_mid_end_copy_repeat_emit_encodeBetterBlockAsm2MB: MOVQ SI, CX JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm2MB memmove_long_repeat_emit_encodeBetterBlockAsm2MB: LEAQ (CX)(DI*1), SI // genMemMoveLong MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(DI*1), X2 MOVOU -16(R8)(DI*1), X3 MOVQ DI, R10 SHRQ $0x05, R10 MOVQ CX, R9 ANDL $0x0000001f, R9 MOVQ $0x00000040, R11 SUBQ R9, R11 DECQ R10 JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm2MBlarge_forward_sse_loop_32 LEAQ -32(R8)(R11*1), R9 LEAQ -32(CX)(R11*1), R12 emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm2MBlarge_big_loop_back: MOVOU (R9), X4 MOVOU 16(R9), X5 MOVOA X4, (R12) MOVOA X5, 16(R12) ADDQ $0x20, R12 ADDQ $0x20, R9 ADDQ $0x20, R11 DECQ R10 JNA 
emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm2MBlarge_big_loop_back emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm2MBlarge_forward_sse_loop_32: MOVOU -32(R8)(R11*1), X4 MOVOU -16(R8)(R11*1), X5 MOVOA X4, -32(CX)(R11*1) MOVOA X5, -16(CX)(R11*1) ADDQ $0x20, R11 CMPQ DI, R11 JAE emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm2MBlarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(DI*1) MOVOU X3, -16(CX)(DI*1) MOVQ SI, CX emit_literal_done_repeat_emit_encodeBetterBlockAsm2MB: ADDL $0x05, AX MOVL AX, SI SUBL 16(SP), SI MOVQ src_len+32(FP), DI SUBL AX, DI LEAQ (DX)(AX*1), R8 LEAQ (DX)(SI*1), SI // matchLen XORL R10, R10 JMP matchlen_loop_16_entry_repeat_extend_encodeBetterBlockAsm2MB matchlen_loopback_16_repeat_extend_encodeBetterBlockAsm2MB: MOVQ (R8)(R10*1), R9 MOVQ 8(R8)(R10*1), R11 XORQ (SI)(R10*1), R9 JNZ matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm2MB XORQ 8(SI)(R10*1), R11 JNZ matchlen_bsf_16repeat_extend_encodeBetterBlockAsm2MB LEAL -16(DI), DI LEAL 16(R10), R10 matchlen_loop_16_entry_repeat_extend_encodeBetterBlockAsm2MB: CMPL DI, $0x10 JAE matchlen_loopback_16_repeat_extend_encodeBetterBlockAsm2MB JMP matchlen_match8_repeat_extend_encodeBetterBlockAsm2MB matchlen_bsf_16repeat_extend_encodeBetterBlockAsm2MB: #ifdef GOAMD64_v3 TZCNTQ R11, R11 #else BSFQ R11, R11 #endif SARQ $0x03, R11 LEAL 8(R10)(R11*1), R10 JMP repeat_extend_forward_end_encodeBetterBlockAsm2MB matchlen_match8_repeat_extend_encodeBetterBlockAsm2MB: CMPL DI, $0x08 JB matchlen_match4_repeat_extend_encodeBetterBlockAsm2MB MOVQ (R8)(R10*1), R9 XORQ (SI)(R10*1), R9 JNZ matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm2MB LEAL -8(DI), DI LEAL 8(R10), R10 JMP matchlen_match4_repeat_extend_encodeBetterBlockAsm2MB matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm2MB: #ifdef GOAMD64_v3 TZCNTQ R9, R9 #else BSFQ R9, R9 #endif SARQ $0x03, R9 LEAL (R10)(R9*1), R10 JMP repeat_extend_forward_end_encodeBetterBlockAsm2MB 
	// Tail of the forward match-length scan for a repeat match.
	// R8 and SI point at the two byte sequences being compared, R10 is the
	// number of bytes matched so far and DI the number of source bytes left.
matchlen_match4_repeat_extend_encodeBetterBlockAsm2MB:
	// Fewer than 4 bytes left, or the next 4 bytes differ: try 2 bytes next.
	CMPL DI, $0x04
	JB matchlen_match2_repeat_extend_encodeBetterBlockAsm2MB
	MOVL (R8)(R10*1), R9
	CMPL (SI)(R10*1), R9
	JNE matchlen_match2_repeat_extend_encodeBetterBlockAsm2MB
	LEAL -4(DI), DI
	LEAL 4(R10), R10

matchlen_match2_repeat_extend_encodeBetterBlockAsm2MB:
	// Exactly 1 byte left: compare that byte. 0 bytes left: done.
	CMPL DI, $0x01
	JE matchlen_match1_repeat_extend_encodeBetterBlockAsm2MB
	JB repeat_extend_forward_end_encodeBetterBlockAsm2MB
	MOVW (R8)(R10*1), R9
	CMPW (SI)(R10*1), R9
	JNE matchlen_match1_repeat_extend_encodeBetterBlockAsm2MB
	LEAL 2(R10), R10
	SUBL $0x02, DI
	JZ repeat_extend_forward_end_encodeBetterBlockAsm2MB

matchlen_match1_repeat_extend_encodeBetterBlockAsm2MB:
	MOVB (R8)(R10*1), R9
	CMPB (SI)(R10*1), R9
	JNE repeat_extend_forward_end_encodeBetterBlockAsm2MB
	LEAL 1(R10), R10

repeat_extend_forward_end_encodeBetterBlockAsm2MB:
	// Advance the source position by the matched count; SI = AX - BX is the
	// length that gets encoded below.
	ADDL R10, AX
	MOVL AX, SI
	SUBL BX, SI
	// NOTE(review): BX is overwritten by the very next instruction (and again
	// on every branch below) before it is read, so this load appears to have
	// no effect.
	MOVL 16(SP), BX
	// emitRepeat
	// Length <= 29: one tag byte. Otherwise the tag is followed by 1, 2 or 3
	// bytes holding length - 30.
	LEAL -1(SI), BX
	CMPL SI, $0x1d
	JBE repeat_one_match_repeat_encodeBetterBlockAsm2MB
	LEAL -30(SI), BX
	CMPL SI, $0x0000011e
	JB repeat_two_match_repeat_encodeBetterBlockAsm2MB
	CMPL SI, $0x0001001e
	JB repeat_three_match_repeat_encodeBetterBlockAsm2MB
	// Tag 0xfc plus length - 30. MOVL stores 4 bytes at 1(CX), but CX only
	// advances by 4 in total, so 3 of them are kept.
	MOVB $0xfc, (CX)
	MOVL BX, 1(CX)
	ADDQ $0x04, CX
	JMP repeat_end_emit_encodeBetterBlockAsm2MB

repeat_three_match_repeat_encodeBetterBlockAsm2MB:
	// Tag 0xf4 followed by length - 30 as 2 bytes.
	MOVB $0xf4, (CX)
	MOVW BX, 1(CX)
	ADDQ $0x03, CX
	JMP repeat_end_emit_encodeBetterBlockAsm2MB

repeat_two_match_repeat_encodeBetterBlockAsm2MB:
	// Tag 0xec followed by length - 30 as 1 byte.
	MOVB $0xec, (CX)
	MOVB BL, 1(CX)
	ADDQ $0x02, CX
	JMP repeat_end_emit_encodeBetterBlockAsm2MB

repeat_one_match_repeat_encodeBetterBlockAsm2MB:
	// Single tag byte: length*8 - 4, i.e. (length-1) in the upper 5 bits and
	// 0b100 in the low 3 bits.
	XORL BX, BX
	LEAL -4(BX)(SI*8), BX
	MOVB BL, (CX)
	ADDQ $0x01, CX

repeat_end_emit_encodeBetterBlockAsm2MB:
	// Everything up to AX has been emitted; resume searching from there.
	MOVL AX, 12(SP)
	JMP search_loop_encodeBetterBlockAsm2MB

no_repeat_found_encodeBetterBlockAsm2MB:
	// 32-bit compares of the bytes loaded from the two table candidates (R10
	// and R11) against the bytes at s (DI).
	CMPL R10, DI
	JEQ candidate_match_encodeBetterBlockAsm2MB
	CMPL R11, DI
	JEQ candidateS_match_encodeBetterBlockAsm2MB
	// Neither candidate matches: continue at the position saved in 20(SP).
	MOVL 20(SP), AX
	JMP search_loop_encodeBetterBlockAsm2MB

candidateS_match_encodeBetterBlockAsm2MB:
	SHRQ
$0x08, DI
	// The shift above leaves the source bytes starting at s+1 in DI. Hash
	// them with the same multiplier/shift as the offset-0 table (R9 still holds
	// the multiplier, BX the tmp base), fetch that table's entry and record
	// s+1 in its place.
	MOVQ DI, R10
	SHLQ $0x08, R10
	IMULQ R9, R10
	SHRQ $0x2f, R10
	MOVL (BX)(R10*4), SI
	INCL AX
	MOVL AX, (BX)(R10*4)
	// 4 bytes at that entry equal the 4 bytes at s+1: match from s+1 with it.
	CMPL (DX)(SI*1), DI
	JEQ candidate_match_encodeBetterBlockAsm2MB
	// Otherwise step back to s and use the second table's candidate (R8).
	DECL AX
	MOVL R8, SI

candidate_match_encodeBetterBlockAsm2MB:
	// AX = match position, SI = candidate position, BX = literal start.
	// A candidate at position 0 has nothing before it to extend into.
	MOVL 12(SP), BX
	TESTL SI, SI
	JZ match_extend_back_end_encodeBetterBlockAsm2MB

	// Step both positions backwards while AX is above the literal start, the
	// preceding bytes are equal and the candidate has not reached position 0.
match_extend_back_loop_encodeBetterBlockAsm2MB:
	CMPL AX, BX
	JBE match_extend_back_end_encodeBetterBlockAsm2MB
	MOVB -1(DX)(SI*1), DI
	MOVB -1(DX)(AX*1), R8
	CMPB DI, R8
	JNE match_extend_back_end_encodeBetterBlockAsm2MB
	LEAL -1(AX), AX
	DECL SI
	JZ match_extend_back_end_encodeBetterBlockAsm2MB
	JMP match_extend_back_loop_encodeBetterBlockAsm2MB

match_extend_back_end_encodeBetterBlockAsm2MB:
	// dst size check: CX + 4 + (AX - literal start) must stay below the limit
	// in (SP); otherwise store 0 as the result and return.
	MOVL AX, BX
	SUBL 12(SP), BX
	LEAQ 4(CX)(BX*1), BX
	CMPQ BX, (SP)
	JB match_dst_size_check_encodeBetterBlockAsm2MB
	MOVQ $0x00000000, ret+56(FP)
	RET

match_dst_size_check_encodeBetterBlockAsm2MB:
	// BX keeps the match start. Advance both positions by 4 before measuring
	// the rest of the match: DI = source bytes left after AX, R8 and R9 point
	// at the two byte sequences to compare.
	MOVL AX, BX
	ADDL $0x04, AX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), DI
	SUBL AX, DI
	LEAQ (DX)(AX*1), R8
	LEAQ (DX)(SI*1), R9
	// matchLen
	// R11 counts the matching bytes.
	XORL R11, R11
	JMP matchlen_loop_16_entry_match_nolit_encodeBetterBlockAsm2MB

	// Compare 16 bytes per iteration as two 8-byte XORs; a non-zero XOR result
	// marks the first differing byte.
matchlen_loopback_16_match_nolit_encodeBetterBlockAsm2MB:
	MOVQ (R8)(R11*1), R10
	MOVQ 8(R8)(R11*1), R12
	XORQ (R9)(R11*1), R10
	JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm2MB
	XORQ 8(R9)(R11*1), R12
	JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm2MB
	LEAL -16(DI), DI
	LEAL 16(R11), R11

matchlen_loop_16_entry_match_nolit_encodeBetterBlockAsm2MB:
	CMPL DI, $0x10
	JAE matchlen_loopback_16_match_nolit_encodeBetterBlockAsm2MB
	JMP matchlen_match8_match_nolit_encodeBetterBlockAsm2MB

matchlen_bsf_16match_nolit_encodeBetterBlockAsm2MB:
	// Difference is in the second 8 bytes: the index of the lowest set bit
	// divided by 8 gives the number of equal bytes; add 8 for the first half.
#ifdef GOAMD64_v3
	TZCNTQ R12, R12
#else
	BSFQ R12, R12
#endif
	SARQ $0x03, R12
	LEAL 8(R11)(R12*1), R11
	JMP match_nolit_end_encodeBetterBlockAsm2MB

matchlen_match8_match_nolit_encodeBetterBlockAsm2MB:
	// Fewer than 16 bytes left: try one 8-byte compare.
	CMPL DI, $0x08
	JB matchlen_match4_match_nolit_encodeBetterBlockAsm2MB
	MOVQ (R8)(R11*1), R10
	XORQ (R9)(R11*1), R10
	JNZ
matchlen_bsf_8_match_nolit_encodeBetterBlockAsm2MB LEAL -8(DI), DI LEAL 8(R11), R11 JMP matchlen_match4_match_nolit_encodeBetterBlockAsm2MB matchlen_bsf_8_match_nolit_encodeBetterBlockAsm2MB: #ifdef GOAMD64_v3 TZCNTQ R10, R10 #else BSFQ R10, R10 #endif SARQ $0x03, R10 LEAL (R11)(R10*1), R11 JMP match_nolit_end_encodeBetterBlockAsm2MB matchlen_match4_match_nolit_encodeBetterBlockAsm2MB: CMPL DI, $0x04 JB matchlen_match2_match_nolit_encodeBetterBlockAsm2MB MOVL (R8)(R11*1), R10 CMPL (R9)(R11*1), R10 JNE matchlen_match2_match_nolit_encodeBetterBlockAsm2MB LEAL -4(DI), DI LEAL 4(R11), R11 matchlen_match2_match_nolit_encodeBetterBlockAsm2MB: CMPL DI, $0x01 JE matchlen_match1_match_nolit_encodeBetterBlockAsm2MB JB match_nolit_end_encodeBetterBlockAsm2MB MOVW (R8)(R11*1), R10 CMPW (R9)(R11*1), R10 JNE matchlen_match1_match_nolit_encodeBetterBlockAsm2MB LEAL 2(R11), R11 SUBL $0x02, DI JZ match_nolit_end_encodeBetterBlockAsm2MB matchlen_match1_match_nolit_encodeBetterBlockAsm2MB: MOVB (R8)(R11*1), R10 CMPB (R9)(R11*1), R10 JNE match_nolit_end_encodeBetterBlockAsm2MB LEAL 1(R11), R11 match_nolit_end_encodeBetterBlockAsm2MB: MOVL AX, DI SUBL SI, DI CMPL R11, $0x01 JA match_length_ok_encodeBetterBlockAsm2MB CMPL DI, $0x0001003f JBE match_length_ok_encodeBetterBlockAsm2MB MOVL 20(SP), AX INCL AX JMP search_loop_encodeBetterBlockAsm2MB match_length_ok_encodeBetterBlockAsm2MB: MOVL DI, 16(SP) // Check if we can combine lit+copy MOVLQZX 12(SP), R8 MOVL BX, SI SUBL R8, SI JZ match_emit_nolits_encodeBetterBlockAsm2MB CMPL DI, $0x00000040 JL match_emit_lits_encodeBetterBlockAsm2MB CMPL DI, $0x0001003f JA match_emit_copy3_encodeBetterBlockAsm2MB CMPL SI, $0x04 JA match_emit_lits_encodeBetterBlockAsm2MB MOVL (DX)(R8*1), R8 ADDL R11, AX ADDL $0x04, R11 MOVL AX, 12(SP) // emitCopy2WithLits XORQ R9, R9 SUBL $0x40, DI LEAL -11(R11), R10 LEAL -4(R11), R11 MOVW DI, 1(CX) CMPL R11, $0x07 CMOVLGE R10, R9 MOVQ $0x00000007, DI CMOVLLT R11, DI LEAL -1(SI)(DI*4), DI MOVL $0x00000003, R10 LEAL 
(R10)(DI*8), DI MOVB DI, (CX) ADDQ $0x03, CX MOVL R8, (CX) ADDQ SI, CX TESTL R9, R9 JZ match_nolit_emitcopy_end_encodeBetterBlockAsm2MB // emitRepeat LEAL -1(R9), SI CMPL R9, $0x1d JBE repeat_one_match_emit_repeat_copy2_encodeBetterBlockAsm2MB LEAL -30(R9), SI CMPL R9, $0x0000011e JB repeat_two_match_emit_repeat_copy2_encodeBetterBlockAsm2MB CMPL R9, $0x0001001e JB repeat_three_match_emit_repeat_copy2_encodeBetterBlockAsm2MB MOVB $0xfc, (CX) MOVL SI, 1(CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB repeat_three_match_emit_repeat_copy2_encodeBetterBlockAsm2MB: MOVB $0xf4, (CX) MOVW SI, 1(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB repeat_two_match_emit_repeat_copy2_encodeBetterBlockAsm2MB: MOVB $0xec, (CX) MOVB SI, 1(CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB repeat_one_match_emit_repeat_copy2_encodeBetterBlockAsm2MB: XORL SI, SI LEAL -4(SI)(R9*8), SI MOVB SI, (CX) ADDQ $0x01, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB match_emit_copy3_encodeBetterBlockAsm2MB: CMPL SI, $0x03 JA match_emit_lits_encodeBetterBlockAsm2MB MOVLQZX 12(SP), R8 MOVL (DX)(R8*1), R8 ADDL R11, AX ADDL $0x04, R11 MOVL AX, 12(SP) // emitCopy3 LEAL -4(R11), R11 LEAL -65536(DI), DI SHLL $0x0b, DI LEAL 7(DI)(SI*8), DI CMPL R11, $0x3c JBE emit_copy3_0_match_emit_lits_encodeBetterBlockAsm2MB LEAL -60(R11), R9 CMPL R11, $0x0000013c JB emit_copy3_1_match_emit_lits_encodeBetterBlockAsm2MB CMPL R11, $0x0001003c JB emit_copy3_2_match_emit_lits_encodeBetterBlockAsm2MB ADDL $0x000007e0, DI MOVL DI, (CX) MOVL R9, 4(CX) ADDQ $0x07, CX JMP match_emit_copy_litsencodeBetterBlockAsm2MB emit_copy3_2_match_emit_lits_encodeBetterBlockAsm2MB: ADDL $0x000007c0, DI MOVL DI, (CX) MOVW R9, 4(CX) ADDQ $0x06, CX JMP match_emit_copy_litsencodeBetterBlockAsm2MB emit_copy3_1_match_emit_lits_encodeBetterBlockAsm2MB: ADDL $0x000007a0, DI MOVL DI, (CX) MOVB R9, 4(CX) ADDQ $0x05, CX JMP match_emit_copy_litsencodeBetterBlockAsm2MB 
emit_copy3_0_match_emit_lits_encodeBetterBlockAsm2MB: SHLL $0x05, R11 ORL R11, DI MOVL DI, (CX) ADDQ $0x04, CX match_emit_copy_litsencodeBetterBlockAsm2MB: MOVL R8, (CX) ADDQ SI, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB match_emit_lits_encodeBetterBlockAsm2MB: LEAQ (DX)(R8*1), R8 // emitLiteral LEAL -1(SI), R9 CMPL R9, $0x1d JB one_byte_match_emit_encodeBetterBlockAsm2MB SUBL $0x1d, R9 CMPL R9, $0x00000100 JB two_bytes_match_emit_encodeBetterBlockAsm2MB CMPL R9, $0x00010000 JB three_bytes_match_emit_encodeBetterBlockAsm2MB MOVL R9, R10 SHRL $0x10, R10 MOVB $0xf8, (CX) MOVW R9, 1(CX) MOVB R10, 3(CX) ADDQ $0x04, CX ADDL $0x1d, R9 JMP memmove_long_match_emit_encodeBetterBlockAsm2MB three_bytes_match_emit_encodeBetterBlockAsm2MB: MOVB $0xf0, (CX) MOVW R9, 1(CX) ADDQ $0x03, CX ADDL $0x1d, R9 JMP memmove_long_match_emit_encodeBetterBlockAsm2MB two_bytes_match_emit_encodeBetterBlockAsm2MB: MOVB $0xe8, (CX) MOVB R9, 1(CX) ADDL $0x1d, R9 ADDQ $0x02, CX CMPL R9, $0x40 JB memmove_midmatch_emit_encodeBetterBlockAsm2MB JMP memmove_long_match_emit_encodeBetterBlockAsm2MB one_byte_match_emit_encodeBetterBlockAsm2MB: SHLB $0x03, R9 MOVB R9, (CX) ADDQ $0x01, CX LEAQ (CX)(SI*1), R9 // genMemMoveShort // margin: 16, min move: 1 CMPQ SI, $0x10 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm2MB_memmove_move_8through16 CMPQ SI, $0x20 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm2MB_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm2MB_memmove_move_33through64 emit_lit_memmove_match_emit_encodeBetterBlockAsm2MB_memmove_move_8through16: MOVOU (R8), X0 MOVOU X0, (CX) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm2MB emit_lit_memmove_match_emit_encodeBetterBlockAsm2MB_memmove_move_17through32: MOVOU (R8), X0 MOVOU -16(R8)(SI*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(SI*1) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm2MB emit_lit_memmove_match_emit_encodeBetterBlockAsm2MB_memmove_move_33through64: MOVOU (R8), X0 MOVOU 16(R8), 
X1 MOVOU -32(R8)(SI*1), X2 MOVOU -16(R8)(SI*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(SI*1) MOVOU X3, -16(CX)(SI*1) memmove_end_copy_match_emit_encodeBetterBlockAsm2MB: MOVQ R9, CX JMP match_emit_nolits_encodeBetterBlockAsm2MB memmove_midmatch_emit_encodeBetterBlockAsm2MB: LEAQ (CX)(SI*1), R9 // genMemMoveShort // margin: 15, min move: 30 CMPQ SI, $0x20 JBE emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm2MB_memmove_move_17through32 JMP emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm2MB_memmove_move_33through64 emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm2MB_memmove_move_17through32: MOVOU (R8), X0 MOVOU -16(R8)(SI*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(SI*1) JMP memmove_mid_end_copy_match_emit_encodeBetterBlockAsm2MB emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm2MB_memmove_move_33through64: MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(SI*1), X2 MOVOU -16(R8)(SI*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(SI*1) MOVOU X3, -16(CX)(SI*1) memmove_mid_end_copy_match_emit_encodeBetterBlockAsm2MB: MOVQ R9, CX JMP match_emit_nolits_encodeBetterBlockAsm2MB memmove_long_match_emit_encodeBetterBlockAsm2MB: LEAQ (CX)(SI*1), R9 // genMemMoveLong MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(SI*1), X2 MOVOU -16(R8)(SI*1), X3 MOVQ SI, R12 SHRQ $0x05, R12 MOVQ CX, R10 ANDL $0x0000001f, R10 MOVQ $0x00000040, R13 SUBQ R10, R13 DECQ R12 JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm2MBlarge_forward_sse_loop_32 LEAQ -32(R8)(R13*1), R10 LEAQ -32(CX)(R13*1), R14 emit_lit_memmove_long_match_emit_encodeBetterBlockAsm2MBlarge_big_loop_back: MOVOU (R10), X4 MOVOU 16(R10), X5 MOVOA X4, (R14) MOVOA X5, 16(R14) ADDQ $0x20, R14 ADDQ $0x20, R10 ADDQ $0x20, R13 DECQ R12 JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm2MBlarge_big_loop_back emit_lit_memmove_long_match_emit_encodeBetterBlockAsm2MBlarge_forward_sse_loop_32: MOVOU -32(R8)(R13*1), X4 MOVOU -16(R8)(R13*1), X5 MOVOA X4, -32(CX)(R13*1) MOVOA X5, -16(CX)(R13*1) ADDQ 
$0x20, R13 CMPQ SI, R13 JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm2MBlarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(SI*1) MOVOU X3, -16(CX)(SI*1) MOVQ R9, CX match_emit_nolits_encodeBetterBlockAsm2MB: ADDL R11, AX ADDL $0x04, R11 MOVL AX, 12(SP) // emitCopy CMPL DI, $0x0001003f JBE two_byte_offset_match_nolit_encodeBetterBlockAsm2MB // emitCopy3 LEAL -4(R11), R11 LEAL -65536(DI), SI SHLL $0x0b, SI ADDL $0x07, SI CMPL R11, $0x3c JBE emit_copy3_0_match_nolit_encodeBetterBlockAsm2MB_emit3 LEAL -60(R11), DI CMPL R11, $0x0000013c JB emit_copy3_1_match_nolit_encodeBetterBlockAsm2MB_emit3 CMPL R11, $0x0001003c JB emit_copy3_2_match_nolit_encodeBetterBlockAsm2MB_emit3 ADDL $0x000007e0, SI MOVL SI, (CX) MOVL DI, 4(CX) ADDQ $0x07, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB emit_copy3_2_match_nolit_encodeBetterBlockAsm2MB_emit3: ADDL $0x000007c0, SI MOVL SI, (CX) MOVW DI, 4(CX) ADDQ $0x06, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB emit_copy3_1_match_nolit_encodeBetterBlockAsm2MB_emit3: ADDL $0x000007a0, SI MOVL SI, (CX) MOVB DI, 4(CX) ADDQ $0x05, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB emit_copy3_0_match_nolit_encodeBetterBlockAsm2MB_emit3: SHLL $0x05, R11 ORL R11, SI MOVL SI, (CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB two_byte_offset_match_nolit_encodeBetterBlockAsm2MB: CMPL DI, $0x00000400 JA two_byte_match_nolit_encodeBetterBlockAsm2MB CMPL R11, $0x00000013 JAE emit_one_longer_match_nolit_encodeBetterBlockAsm2MB LEAL -1(DI), SI SHLL $0x06, SI LEAL -15(SI)(R11*4), SI MOVW SI, (CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB emit_one_longer_match_nolit_encodeBetterBlockAsm2MB: CMPL R11, $0x00000112 JAE emit_copy1_repeat_match_nolit_encodeBetterBlockAsm2MB LEAL -1(DI), SI SHLL $0x06, SI LEAL 61(SI), SI MOVW SI, (CX) LEAL -18(R11), SI MOVB SI, 2(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB 
emit_copy1_repeat_match_nolit_encodeBetterBlockAsm2MB: LEAL -1(DI), SI SHLL $0x06, SI LEAL 57(SI), SI MOVW SI, (CX) ADDQ $0x02, CX SUBL $0x12, R11 // emitRepeat LEAL -1(R11), SI CMPL R11, $0x1d JBE repeat_one_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm2MB LEAL -30(R11), SI CMPL R11, $0x0000011e JB repeat_two_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm2MB CMPL R11, $0x0001001e JB repeat_three_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm2MB MOVB $0xfc, (CX) MOVL SI, 1(CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB repeat_three_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm2MB: MOVB $0xf4, (CX) MOVW SI, 1(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB repeat_two_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm2MB: MOVB $0xec, (CX) MOVB SI, 1(CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB repeat_one_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm2MB: XORL SI, SI LEAL -4(SI)(R11*8), SI MOVB SI, (CX) ADDQ $0x01, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB two_byte_match_nolit_encodeBetterBlockAsm2MB: // emitCopy2 LEAL -64(DI), DI LEAL -4(R11), R11 MOVW DI, 1(CX) CMPL R11, $0x3c JBE emit_copy2_0_match_nolit_encodeBetterBlockAsm2MB_emit2 LEAL -60(R11), SI CMPL R11, $0x0000013c JB emit_copy2_1_match_nolit_encodeBetterBlockAsm2MB_emit2 CMPL R11, $0x0001003c JB emit_copy2_2_match_nolit_encodeBetterBlockAsm2MB_emit2 MOVB $0xfe, (CX) MOVL SI, 3(CX) ADDQ $0x06, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB emit_copy2_2_match_nolit_encodeBetterBlockAsm2MB_emit2: MOVB $0xfa, (CX) MOVW SI, 3(CX) ADDQ $0x05, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB emit_copy2_1_match_nolit_encodeBetterBlockAsm2MB_emit2: MOVB $0xf6, (CX) MOVB SI, 3(CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB emit_copy2_0_match_nolit_encodeBetterBlockAsm2MB_emit2: MOVL $0x00000002, SI LEAL (SI)(R11*4), SI MOVB SI, (CX) ADDQ $0x03, CX 
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB // emitLiteralsDstP MOVL 12(SP), SI CMPL SI, BX JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm2MB MOVL BX, DI MOVL BX, 12(SP) LEAQ (DX)(SI*1), R8 SUBL SI, DI // emitLiteral LEAL -1(DI), SI CMPL SI, $0x1d JB one_byte_match_emit_repeat_encodeBetterBlockAsm2MB SUBL $0x1d, SI CMPL SI, $0x00000100 JB two_bytes_match_emit_repeat_encodeBetterBlockAsm2MB CMPL SI, $0x00010000 JB three_bytes_match_emit_repeat_encodeBetterBlockAsm2MB MOVL SI, R9 SHRL $0x10, R9 MOVB $0xf8, (CX) MOVW SI, 1(CX) MOVB R9, 3(CX) ADDQ $0x04, CX ADDL $0x1d, SI JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm2MB three_bytes_match_emit_repeat_encodeBetterBlockAsm2MB: MOVB $0xf0, (CX) MOVW SI, 1(CX) ADDQ $0x03, CX ADDL $0x1d, SI JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm2MB two_bytes_match_emit_repeat_encodeBetterBlockAsm2MB: MOVB $0xe8, (CX) MOVB SI, 1(CX) ADDL $0x1d, SI ADDQ $0x02, CX CMPL SI, $0x40 JB memmove_midmatch_emit_repeat_encodeBetterBlockAsm2MB JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm2MB one_byte_match_emit_repeat_encodeBetterBlockAsm2MB: SHLB $0x03, SI MOVB SI, (CX) ADDQ $0x01, CX LEAQ (CX)(DI*1), SI // genMemMoveShort // margin: 16, min move: 1 CMPQ DI, $0x10 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm2MB_memmove_move_8through16 CMPQ DI, $0x20 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm2MB_memmove_move_17through32 JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm2MB_memmove_move_33through64 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm2MB_memmove_move_8through16: MOVOU (R8), X0 MOVOU X0, (CX) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm2MB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm2MB_memmove_move_17through32: MOVOU (R8), X0 MOVOU -16(R8)(DI*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(DI*1) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm2MB 
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm2MB_memmove_move_33through64: MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(DI*1), X2 MOVOU -16(R8)(DI*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(DI*1) MOVOU X3, -16(CX)(DI*1) memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm2MB: MOVQ SI, CX JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm2MB memmove_midmatch_emit_repeat_encodeBetterBlockAsm2MB: LEAQ (CX)(DI*1), SI // genMemMoveShort // margin: 15, min move: 30 CMPQ DI, $0x20 JBE emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm2MB_memmove_move_17through32 JMP emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm2MB_memmove_move_33through64 emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm2MB_memmove_move_17through32: MOVOU (R8), X0 MOVOU -16(R8)(DI*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(DI*1) JMP memmove_mid_end_copy_match_emit_repeat_encodeBetterBlockAsm2MB emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm2MB_memmove_move_33through64: MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(DI*1), X2 MOVOU -16(R8)(DI*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(DI*1) MOVOU X3, -16(CX)(DI*1) memmove_mid_end_copy_match_emit_repeat_encodeBetterBlockAsm2MB: MOVQ SI, CX JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm2MB memmove_long_match_emit_repeat_encodeBetterBlockAsm2MB: LEAQ (CX)(DI*1), SI // genMemMoveLong MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(DI*1), X2 MOVOU -16(R8)(DI*1), X3 MOVQ DI, R10 SHRQ $0x05, R10 MOVQ CX, R9 ANDL $0x0000001f, R9 MOVQ $0x00000040, R12 SUBQ R9, R12 DECQ R10 JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm2MBlarge_forward_sse_loop_32 LEAQ -32(R8)(R12*1), R9 LEAQ -32(CX)(R12*1), R13 emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm2MBlarge_big_loop_back: MOVOU (R9), X4 MOVOU 16(R9), X5 MOVOA X4, (R13) MOVOA X5, 16(R13) ADDQ $0x20, R13 ADDQ $0x20, R9 ADDQ $0x20, R12 DECQ R10 JNA 
emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm2MBlarge_big_loop_back emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm2MBlarge_forward_sse_loop_32: MOVOU -32(R8)(R12*1), X4 MOVOU -16(R8)(R12*1), X5 MOVOA X4, -32(CX)(R12*1) MOVOA X5, -16(CX)(R12*1) ADDQ $0x20, R12 CMPQ DI, R12 JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm2MBlarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(DI*1) MOVOU X3, -16(CX)(DI*1) MOVQ SI, CX emit_literal_done_match_emit_repeat_encodeBetterBlockAsm2MB: ADDL R11, AX ADDL $0x04, R11 MOVL AX, 12(SP) // emitRepeat LEAL -1(R11), SI CMPL R11, $0x1d JBE repeat_one_match_nolit_repeat_encodeBetterBlockAsm2MB LEAL -30(R11), SI CMPL R11, $0x0000011e JB repeat_two_match_nolit_repeat_encodeBetterBlockAsm2MB CMPL R11, $0x0001001e JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm2MB MOVB $0xfc, (CX) MOVL SI, 1(CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB repeat_three_match_nolit_repeat_encodeBetterBlockAsm2MB: MOVB $0xf4, (CX) MOVW SI, 1(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB repeat_two_match_nolit_repeat_encodeBetterBlockAsm2MB: MOVB $0xec, (CX) MOVB SI, 1(CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB repeat_one_match_nolit_repeat_encodeBetterBlockAsm2MB: XORL SI, SI LEAL -4(SI)(R11*8), SI MOVB SI, (CX) ADDQ $0x01, CX match_nolit_emitcopy_end_encodeBetterBlockAsm2MB: CMPL AX, 8(SP) JAE emit_remainder_encodeBetterBlockAsm2MB CMPQ CX, (SP) JB match_nolit_dst_ok_encodeBetterBlockAsm2MB MOVQ $0x00000000, ret+56(FP) RET match_nolit_dst_ok_encodeBetterBlockAsm2MB: MOVQ tmp+48(FP), SI MOVQ $0x00cf1bbcdcbfa563, DI MOVQ $0x9e3779b1, R8 LEAQ 1(BX), BX LEAQ -2(AX), R9 MOVQ (DX)(BX*1), R10 MOVQ 1(DX)(BX*1), R11 MOVQ (DX)(R9*1), R12 MOVQ 1(DX)(R9*1), R13 SHLQ $0x08, R10 IMULQ DI, R10 SHRQ $0x2f, R10 SHLQ $0x20, R11 IMULQ R8, R11 SHRQ $0x32, R11 SHLQ $0x08, R12 IMULQ DI, R12 SHRQ $0x2f, R12 SHLQ $0x20, R13 IMULQ 
R8, R13
	SHRQ $0x32, R13
	// At this point: SI = tmp base, DI = multiplier for the offset-0 table,
	// BX and R9 = the two source positions being indexed (BX near the start of
	// the match, R9 = AX - 2), and R10/R12 and R11/R13 = their hashes for the
	// offset-0 table and the offset-524288 table respectively.
	LEAQ 1(BX), R8
	LEAQ 1(R9), R14
	// Offset-0 table: record both positions under their own hashes.
	MOVL BX, (SI)(R10*4)
	MOVL R9, (SI)(R12*4)
	// R10 = (BX + R9 + 1) >> 1: a second cursor starting midway between the
	// two positions. Then move BX one forward and R9 one back.
	LEAQ 1(R9)(BX*1), R10
	SHRQ $0x01, R10
	ADDQ $0x01, BX
	SUBQ $0x01, R9
	// Offset-524288 table: record position + 1 for both.
	MOVL R8, 524288(SI)(R11*4)
	MOVL R14, 524288(SI)(R13*4)

	// Sparse indexing of the matched region into the offset-0 table: two
	// cursors, BX from the front and R10 from the middle, each advancing by 2
	// until R10 reaches R9.
index_loop_encodeBetterBlockAsm2MB:
	CMPQ R10, R9
	JAE search_loop_encodeBetterBlockAsm2MB
	MOVQ (DX)(BX*1), R8
	MOVQ (DX)(R10*1), R11
	SHLQ $0x08, R8
	IMULQ DI, R8
	SHRQ $0x2f, R8
	SHLQ $0x08, R11
	IMULQ DI, R11
	SHRQ $0x2f, R11
	MOVL BX, (SI)(R8*4)
	// FIX: R11 is the hash of the bytes at position R10, so the entry must
	// record R10. The previous code stored R9 (the loop bound) here, which made
	// every entry written through this slot point at bytes that do not hash to
	// it; such entries are rejected when looked up, so matches were lost.
	// Output stays valid either way because candidates are always verified by
	// comparing source bytes before use.
	// NOTE(review): this file is generated (see the header: gen.go); mirror
	// the change in the generator, and check the sibling encodeBetterBlock*
	// variants for the same store.
	MOVL R10, (SI)(R11*4)
	ADDQ $0x02, BX
	ADDQ $0x02, R10
	JMP index_loop_encodeBetterBlockAsm2MB

emit_remainder_encodeBetterBlockAsm2MB:
	// dst size check for the trailing literals: CX + 4 + (src_len - literal
	// start) must stay below the limit in (SP); otherwise return 0.
	MOVQ src_len+32(FP), AX
	SUBL 12(SP), AX
	LEAQ 4(CX)(AX*1), AX
	CMPQ AX, (SP)
	JB emit_remainder_ok_encodeBetterBlockAsm2MB
	MOVQ $0x00000000, ret+56(FP)
	RET

emit_remainder_ok_encodeBetterBlockAsm2MB:
	MOVQ src_len+32(FP), AX
	// emitLiteralsDstP
	// Nothing to emit when the literal start already equals src_len.
	MOVL 12(SP), BX
	CMPL BX, AX
	JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm2MB
	// AX = pointer to the first literal byte, SI = number of literal bytes.
	MOVL AX, SI
	MOVL AX, 12(SP)
	LEAQ (DX)(BX*1), AX
	SUBL BX, SI
	// emitLiteral
	// DX = length - 1. Below 29 it fits in the tag byte; otherwise the tag is
	// followed by 1, 2 or 3 bytes holding length - 1 - 29.
	LEAL -1(SI), DX
	CMPL DX, $0x1d
	JB one_byte_emit_remainder_encodeBetterBlockAsm2MB
	SUBL $0x1d, DX
	CMPL DX, $0x00000100
	JB two_bytes_emit_remainder_encodeBetterBlockAsm2MB
	CMPL DX, $0x00010000
	JB three_bytes_emit_remainder_encodeBetterBlockAsm2MB
	// Tag 0xf8 followed by 3 length bytes (low 16 bits, then bits 16..23).
	MOVL DX, BX
	SHRL $0x10, BX
	MOVB $0xf8, (CX)
	MOVW DX, 1(CX)
	MOVB BL, 3(CX)
	ADDQ $0x04, CX
	ADDL $0x1d, DX
	JMP memmove_long_emit_remainder_encodeBetterBlockAsm2MB

three_bytes_emit_remainder_encodeBetterBlockAsm2MB:
	// Tag 0xf0 followed by 2 length bytes.
	MOVB $0xf0, (CX)
	MOVW DX, 1(CX)
	ADDQ $0x03, CX
	ADDL $0x1d, DX
	JMP memmove_long_emit_remainder_encodeBetterBlockAsm2MB

two_bytes_emit_remainder_encodeBetterBlockAsm2MB:
	// Tag 0xe8 followed by 1 length byte. With length - 1 below 64 the short
	// copy routine is used, otherwise the long one.
	MOVB $0xe8, (CX)
	MOVB DL, 1(CX)
	ADDL $0x1d, DX
	ADDQ $0x02, CX
	CMPL DX, $0x40
	JB memmove_midemit_remainder_encodeBetterBlockAsm2MB
	JMP memmove_long_emit_remainder_encodeBetterBlockAsm2MB

one_byte_emit_remainder_encodeBetterBlockAsm2MB:
	// Single tag byte: (length - 1) << 3. DX then becomes the dst pointer
	// after the copy and BX the byte count.
	SHLB $0x03, DL
	MOVB DL, (CX)
	ADDQ $0x01, CX
	LEAQ (CX)(SI*1), DX
	MOVL SI, BX
	// genMemMoveShort
	// margin: -1, min move: 1
	CMPQ BX, $0x03
	JB
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm2MB_memmove_move_1or2 JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm2MB_memmove_move_3 CMPQ BX, $0x08 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm2MB_memmove_move_4through8 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm2MB_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm2MB_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm2MB_memmove_move_33through64 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm2MB_memmove_move_1or2: MOVB (AX), SI MOVB -1(AX)(BX*1), AL MOVB SI, (CX) MOVB AL, -1(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm2MB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm2MB_memmove_move_3: MOVW (AX), SI MOVB 2(AX), AL MOVW SI, (CX) MOVB AL, 2(CX) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm2MB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm2MB_memmove_move_4through8: MOVL (AX), SI MOVL -4(AX)(BX*1), AX MOVL SI, (CX) MOVL AX, -4(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm2MB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm2MB_memmove_move_8through16: MOVQ (AX), SI MOVQ -8(AX)(BX*1), AX MOVQ SI, (CX) MOVQ AX, -8(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm2MB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm2MB_memmove_move_17through32: MOVOU (AX), X0 MOVOU -16(AX)(BX*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm2MB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm2MB_memmove_move_33through64: MOVOU (AX), X0 MOVOU 16(AX), X1 MOVOU -32(AX)(BX*1), X2 MOVOU -16(AX)(BX*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(BX*1) MOVOU X3, -16(CX)(BX*1) memmove_end_copy_emit_remainder_encodeBetterBlockAsm2MB: MOVQ DX, CX JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm2MB memmove_midemit_remainder_encodeBetterBlockAsm2MB: LEAQ 
(CX)(SI*1), DX MOVL SI, BX // genMemMoveShort // margin: -2, min move: 30 CMPQ BX, $0x20 JBE emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm2MB_memmove_move_17through32 JMP emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm2MB_memmove_move_33through64 emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm2MB_memmove_move_17through32: MOVOU (AX), X0 MOVOU -16(AX)(BX*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(BX*1) JMP memmove_mid_end_copy_emit_remainder_encodeBetterBlockAsm2MB emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm2MB_memmove_move_33through64: MOVOU (AX), X0 MOVOU 16(AX), X1 MOVOU -32(AX)(BX*1), X2 MOVOU -16(AX)(BX*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(BX*1) MOVOU X3, -16(CX)(BX*1) memmove_mid_end_copy_emit_remainder_encodeBetterBlockAsm2MB: MOVQ DX, CX JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm2MB memmove_long_emit_remainder_encodeBetterBlockAsm2MB: LEAQ (CX)(SI*1), DX MOVL SI, BX // genMemMoveLong MOVOU (AX), X0 MOVOU 16(AX), X1 MOVOU -32(AX)(BX*1), X2 MOVOU -16(AX)(BX*1), X3 MOVQ BX, DI SHRQ $0x05, DI MOVQ CX, SI ANDL $0x0000001f, SI MOVQ $0x00000040, R8 SUBQ SI, R8 DECQ DI JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm2MBlarge_forward_sse_loop_32 LEAQ -32(AX)(R8*1), SI LEAQ -32(CX)(R8*1), R9 emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm2MBlarge_big_loop_back: MOVOU (SI), X4 MOVOU 16(SI), X5 MOVOA X4, (R9) MOVOA X5, 16(R9) ADDQ $0x20, R9 ADDQ $0x20, SI ADDQ $0x20, R8 DECQ DI JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm2MBlarge_big_loop_back emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm2MBlarge_forward_sse_loop_32: MOVOU -32(AX)(R8*1), X4 MOVOU -16(AX)(R8*1), X5 MOVOA X4, -32(CX)(R8*1) MOVOA X5, -16(CX)(R8*1) ADDQ $0x20, R8 CMPQ BX, R8 JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm2MBlarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(BX*1) MOVOU X3, -16(CX)(BX*1) MOVQ DX, CX 
emit_literal_done_emit_remainder_encodeBetterBlockAsm2MB:
	MOVQ dst_base+0(FP), AX
	SUBQ AX, CX
	MOVQ CX, ret+56(FP)
	RET

// func encodeBetterBlockAsm512K(dst []byte, src []byte, tmp *[294912]byte) int
// Requires: BMI, CMOV, SSE2
//
// Encodes src into dst, using tmp as hash-table scratch space.
// Result: the number of bytes written (final dst pointer minus dst_base),
// or 0 when one of the output-limit checks against (SP) fails.
//
// tmp layout as used below (cleared on entry: 0x900 iterations x 128 bytes = 294912 bytes):
//   tmp[0:262144]      4-byte entries indexed by a 16-bit hash of the low 7 bytes of an 8-byte load
//   tmp[262144:294912] 4-byte entries indexed by a 13-bit hash of the low 4 bytes of an 8-byte load
//
// Stack slots:
//   (SP)   dst limit pointer: dst_base + src_len - 11 - (src_len >> 5)
//   8(SP)  src_len - 8; positions at or beyond it branch to emit_remainder
//   12(SP) src offset where the pending (not yet emitted) literal run starts
//   16(SP) offset of the most recent match (initialised to 1)
//   20(SP) position to resume searching from when the current one yields no match
//
// Registers in the main loop: AX = current src offset, CX = dst write pointer,
// DX = src base, BX = tmp base (reloaded at the top of search_loop).
//
// NOTE(review): the file header marks this file as generated ("DO NOT EDIT");
// the NOTE(review) items below should be resolved in the generator, not here.
TEXT ·encodeBetterBlockAsm512K(SB), $24-64
	MOVQ tmp+48(FP), AX
	MOVQ dst_base+0(FP), CX
	MOVQ $0x00000900, DX
	PXOR X0, X0

	// Clear tmp, 128 bytes per iteration.
zero_loop_encodeBetterBlockAsm512K:
	MOVOU X0, (AX)
	MOVOU X0, 16(AX)
	MOVOU X0, 32(AX)
	MOVOU X0, 48(AX)
	MOVOU X0, 64(AX)
	MOVOU X0, 80(AX)
	MOVOU X0, 96(AX)
	MOVOU X0, 112(AX)
	ADDQ $0x80, AX
	DECQ DX
	JNZ zero_loop_encodeBetterBlockAsm512K

	// Initialise the stack slots described in the header; AX = 1 is the first search position.
	MOVL $0x00000000, 12(SP)
	MOVQ src_len+32(FP), AX
	LEAQ -11(AX), DX
	LEAQ -8(AX), BX
	MOVL BX, 8(SP)
	SHRQ $0x05, AX
	SUBL AX, DX
	LEAQ (CX)(DX*1), DX
	MOVQ DX, (SP)
	MOVL $0x00000001, AX
	MOVL AX, 16(SP)
	MOVQ src_base+24(FP), DX

search_loop_encodeBetterBlockAsm512K:
	// Next position = s + 1 + ((s - 12(SP)) >> 7), capped at s + 100.
	MOVQ tmp+48(FP), BX
	MOVL AX, SI
	SUBL 12(SP), SI
	SHRL $0x07, SI
	CMPL SI, $0x63
	JBE check_maxskip_ok_encodeBetterBlockAsm512K
	LEAL 100(AX), SI
	JMP check_maxskip_cont_encodeBetterBlockAsm512K

check_maxskip_ok_encodeBetterBlockAsm512K:
	LEAL 1(AX)(SI*1), SI

check_maxskip_cont_encodeBetterBlockAsm512K:
	CMPL SI, 8(SP)
	JAE emit_remainder_encodeBetterBlockAsm512K

	// DI = 8 bytes at s. Hash them both ways, fetch the previous table entries
	// as candidates (SI from the 16-bit table, R8 from the 13-bit table) and
	// store s in both tables.
	MOVQ (DX)(AX*1), DI
	MOVL SI, 20(SP)
	MOVQ $0x00cf1bbcdcbfa563, R9
	MOVQ $0x9e3779b1, SI
	MOVQ DI, R10
	MOVQ DI, R11
	SHLQ $0x08, R10
	IMULQ R9, R10
	SHRQ $0x30, R10
	SHLQ $0x20, R11
	IMULQ SI, R11
	SHRQ $0x33, R11
	MOVL (BX)(R10*4), SI
	MOVL 262144(BX)(R11*4), R8
	MOVL AX, (BX)(R10*4)
	MOVL AX, 262144(BX)(R11*4)

	// A full 8-byte match at the 16-bit-table candidate is taken immediately.
	MOVQ (DX)(SI*1), R10
	CMPQ R10, DI
	JEQ candidate_match_encodeBetterBlockAsm512K
	MOVQ (DX)(R8*1), R11

	// NOTE(review): the flags of this CMPQ are overwritten by the SUBL below
	// before any branch reads them; R11 is compared again at no_repeat_found.
	CMPQ R11, DI

	// Repeat check: bytes 1..4 of the load at s against the same bytes at s - 16(SP).
	MOVL AX, R12
	SUBL 16(SP), R12
	MOVQ (DX)(R12*1), R12
	MOVQ $0x000000ffffffff00, R13
	XORQ DI, R12
	TESTQ R13, R12
	JNE no_repeat_found_encodeBetterBlockAsm512K

	// Repeat found starting at s+1 (BX). Step BX backwards while the preceding
	// bytes match, stopping at 12(SP) or when the reference position (DI) reaches 0.
	LEAL 1(AX), BX
	MOVL 12(SP), SI
	MOVL BX, DI
	SUBL 16(SP), DI
	JZ repeat_extend_back_end_encodeBetterBlockAsm512K

repeat_extend_back_loop_encodeBetterBlockAsm512K:
	CMPL BX, SI
	JBE repeat_extend_back_end_encodeBetterBlockAsm512K
	MOVB -1(DX)(DI*1), R8
	MOVB -1(DX)(BX*1), R9
	CMPB R8, R9
	JNE repeat_extend_back_end_encodeBetterBlockAsm512K
	LEAL -1(BX), BX
	DECL DI
	JNZ repeat_extend_back_loop_encodeBetterBlockAsm512K

repeat_extend_back_end_encodeBetterBlockAsm512K:
	// Return 0 unless dst + pending literal count + 4 is below the limit in (SP).
	MOVL BX, SI
	SUBL 12(SP), SI
	LEAQ 4(CX)(SI*1), SI
	CMPQ SI, (SP)
	JB repeat_dst_size_check_encodeBetterBlockAsm512K
	MOVQ $0x00000000, ret+56(FP)
	RET

repeat_dst_size_check_encodeBetterBlockAsm512K:
	// emitLiteralsDstP
	// Emit src[12(SP):BX] as literals (skipped when empty) and set 12(SP) = BX.
	// R8 = source pointer, DI = literal count.
	MOVL 12(SP), SI
	CMPL SI, BX
	JEQ emit_literal_done_repeat_emit_encodeBetterBlockAsm512K
	MOVL BX, DI
	MOVL BX, 12(SP)
	LEAQ (DX)(SI*1), R8
	SUBL SI, DI

	// emitLiteral
	// Header: count-1 < 29 -> one byte (count-1)<<3; otherwise tag 0xe8/0xf0/0xf8
	// followed by count-30 in 1/2/3 bytes.
	LEAL -1(DI), SI
	CMPL SI, $0x1d
	JB one_byte_repeat_emit_encodeBetterBlockAsm512K
	SUBL $0x1d, SI
	CMPL SI, $0x00000100
	JB two_bytes_repeat_emit_encodeBetterBlockAsm512K
	CMPL SI, $0x00010000
	JB three_bytes_repeat_emit_encodeBetterBlockAsm512K
	MOVL SI, R9
	SHRL $0x10, R9
	MOVB $0xf8, (CX)
	MOVW SI, 1(CX)
	MOVB R9, 3(CX)
	ADDQ $0x04, CX
	ADDL $0x1d, SI
	JMP memmove_long_repeat_emit_encodeBetterBlockAsm512K

three_bytes_repeat_emit_encodeBetterBlockAsm512K:
	MOVB $0xf0, (CX)
	MOVW SI, 1(CX)
	ADDQ $0x03, CX
	ADDL $0x1d, SI
	JMP memmove_long_repeat_emit_encodeBetterBlockAsm512K

two_bytes_repeat_emit_encodeBetterBlockAsm512K:
	MOVB $0xe8, (CX)
	MOVB SI, 1(CX)
	ADDL $0x1d, SI
	ADDQ $0x02, CX
	CMPL SI, $0x40
	JB memmove_midrepeat_emit_encodeBetterBlockAsm512K
	JMP memmove_long_repeat_emit_encodeBetterBlockAsm512K

one_byte_repeat_emit_encodeBetterBlockAsm512K:
	SHLB $0x03, SI
	MOVB SI, (CX)
	ADDQ $0x01, CX
	LEAQ (CX)(DI*1), SI

	// genMemMoveShort
	// margin: 8, min move: 1
	// SI holds the dst pointer after the copy; counts up to 8 are copied with a full 8-byte move.
	CMPQ DI, $0x08
	JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm512K_memmove_move_8
	CMPQ DI, $0x10
	JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm512K_memmove_move_8through16
	CMPQ DI, $0x20
	JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm512K_memmove_move_17through32
	JMP emit_lit_memmove_repeat_emit_encodeBetterBlockAsm512K_memmove_move_33through64

emit_lit_memmove_repeat_emit_encodeBetterBlockAsm512K_memmove_move_8:
	MOVQ (R8), R9
	MOVQ R9, (CX)
	JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm512K

emit_lit_memmove_repeat_emit_encodeBetterBlockAsm512K_memmove_move_8through16:
	MOVQ (R8), R9
	MOVQ -8(R8)(DI*1), R8
	MOVQ R9, (CX)
	MOVQ R8, -8(CX)(DI*1)
	JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm512K

emit_lit_memmove_repeat_emit_encodeBetterBlockAsm512K_memmove_move_17through32:
	MOVOU (R8), X0
	MOVOU -16(R8)(DI*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(DI*1)
	JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm512K

emit_lit_memmove_repeat_emit_encodeBetterBlockAsm512K_memmove_move_33through64:
	MOVOU (R8), X0
	MOVOU 16(R8), X1
	MOVOU -32(R8)(DI*1), X2
	MOVOU -16(R8)(DI*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(DI*1)
	MOVOU X3, -16(CX)(DI*1)

memmove_end_copy_repeat_emit_encodeBetterBlockAsm512K:
	MOVQ SI, CX
	JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm512K

memmove_midrepeat_emit_encodeBetterBlockAsm512K:
	LEAQ (CX)(DI*1), SI

	// genMemMoveShort
	// margin: 8, min move: 30
	CMPQ DI, $0x20
	JBE emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm512K_memmove_move_17through32
	JMP emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm512K_memmove_move_33through64

emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm512K_memmove_move_17through32:
	MOVOU (R8), X0
	MOVOU -16(R8)(DI*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(DI*1)
	JMP memmove_mid_end_copy_repeat_emit_encodeBetterBlockAsm512K

emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm512K_memmove_move_33through64:
	MOVOU (R8), X0
	MOVOU 16(R8), X1
	MOVOU -32(R8)(DI*1), X2
	MOVOU -16(R8)(DI*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(DI*1)
	MOVOU X3, -16(CX)(DI*1)

memmove_mid_end_copy_repeat_emit_encodeBetterBlockAsm512K:
	MOVQ SI, CX
	JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm512K

memmove_long_repeat_emit_encodeBetterBlockAsm512K:
	LEAQ (CX)(DI*1), SI

	// genMemMoveLong
	// The first and last 32 bytes are loaded up front (X0-X3) and stored last;
	// the loops in between use MOVOA stores to dst in 32-byte steps.
	MOVOU (R8), X0
	MOVOU 16(R8), X1
	MOVOU -32(R8)(DI*1), X2
	MOVOU -16(R8)(DI*1), X3
	MOVQ DI, R10
	SHRQ $0x05, R10
	MOVQ CX, R9
	ANDL $0x0000001f, R9
	MOVQ $0x00000040, R11
	SUBQ R9, R11
	DECQ R10
	JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm512Klarge_forward_sse_loop_32
	LEAQ -32(R8)(R11*1), R9
	LEAQ -32(CX)(R11*1), R12

emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm512Klarge_big_loop_back:
	MOVOU (R9), X4
	MOVOU 16(R9), X5
	MOVOA X4, (R12)
	MOVOA X5, 16(R12)
	ADDQ $0x20, R12
	ADDQ $0x20, R9
	ADDQ $0x20, R11
	DECQ R10
	JNA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm512Klarge_big_loop_back

emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm512Klarge_forward_sse_loop_32:
	MOVOU -32(R8)(R11*1), X4
	MOVOU -16(R8)(R11*1), X5
	MOVOA X4, -32(CX)(R11*1)
	MOVOA X5, -16(CX)(R11*1)
	ADDQ $0x20, R11
	CMPQ DI, R11
	JAE emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm512Klarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(DI*1)
	MOVOU X3, -16(CX)(DI*1)
	MOVQ SI, CX

emit_literal_done_repeat_emit_encodeBetterBlockAsm512K:
	// AX += 5, then extend the repeat forward: compare src at AX with src at
	// AX - 16(SP). R10 counts the additional matching bytes, DI the bytes left in src.
	ADDL $0x05, AX
	MOVL AX, SI
	SUBL 16(SP), SI
	MOVQ src_len+32(FP), DI
	SUBL AX, DI
	LEAQ (DX)(AX*1), R8
	LEAQ (DX)(SI*1), SI

	// matchLen
	XORL R10, R10
	JMP matchlen_loop_16_entry_repeat_extend_encodeBetterBlockAsm512K

matchlen_loopback_16_repeat_extend_encodeBetterBlockAsm512K:
	MOVQ (R8)(R10*1), R9
	MOVQ 8(R8)(R10*1), R11
	XORQ (SI)(R10*1), R9
	JNZ matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm512K
	XORQ 8(SI)(R10*1), R11
	JNZ matchlen_bsf_16repeat_extend_encodeBetterBlockAsm512K
	LEAL -16(DI), DI
	LEAL 16(R10), R10

matchlen_loop_16_entry_repeat_extend_encodeBetterBlockAsm512K:
	CMPL DI, $0x10
	JAE matchlen_loopback_16_repeat_extend_encodeBetterBlockAsm512K
	JMP matchlen_match8_repeat_extend_encodeBetterBlockAsm512K

matchlen_bsf_16repeat_extend_encodeBetterBlockAsm512K:
	// Index of the lowest differing bit, divided by 8, gives the matching byte count.
#ifdef GOAMD64_v3
	TZCNTQ R11, R11
#else
	BSFQ R11, R11
#endif
	SARQ $0x03, R11
	LEAL 8(R10)(R11*1), R10
	JMP repeat_extend_forward_end_encodeBetterBlockAsm512K

matchlen_match8_repeat_extend_encodeBetterBlockAsm512K:
	CMPL DI, $0x08
	JB matchlen_match4_repeat_extend_encodeBetterBlockAsm512K
	MOVQ (R8)(R10*1), R9
	XORQ (SI)(R10*1), R9
	JNZ matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm512K
	LEAL -8(DI), DI
	LEAL 8(R10), R10
	JMP matchlen_match4_repeat_extend_encodeBetterBlockAsm512K

matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm512K:
#ifdef GOAMD64_v3
	TZCNTQ R9, R9
#else
	BSFQ R9, R9
#endif
	SARQ $0x03, R9
	LEAL (R10)(R9*1), R10
	JMP repeat_extend_forward_end_encodeBetterBlockAsm512K

matchlen_match4_repeat_extend_encodeBetterBlockAsm512K:
	CMPL DI, $0x04
	JB matchlen_match2_repeat_extend_encodeBetterBlockAsm512K
	MOVL (R8)(R10*1), R9
	CMPL (SI)(R10*1), R9
	JNE matchlen_match2_repeat_extend_encodeBetterBlockAsm512K
	LEAL -4(DI), DI
	LEAL 4(R10), R10

matchlen_match2_repeat_extend_encodeBetterBlockAsm512K:
	CMPL DI, $0x01
	JE matchlen_match1_repeat_extend_encodeBetterBlockAsm512K
	JB repeat_extend_forward_end_encodeBetterBlockAsm512K
	MOVW (R8)(R10*1), R9
	CMPW (SI)(R10*1), R9
	JNE matchlen_match1_repeat_extend_encodeBetterBlockAsm512K
	LEAL 2(R10), R10
	SUBL $0x02, DI
	JZ repeat_extend_forward_end_encodeBetterBlockAsm512K

matchlen_match1_repeat_extend_encodeBetterBlockAsm512K:
	MOVB (R8)(R10*1), R9
	CMPB (SI)(R10*1), R9
	JNE repeat_extend_forward_end_encodeBetterBlockAsm512K
	LEAL 1(R10), R10

repeat_extend_forward_end_encodeBetterBlockAsm512K:
	// SI = AX - BX = total repeat length.
	ADDL R10, AX
	MOVL AX, SI
	SUBL BX, SI

	// NOTE(review): BX loaded here is overwritten by the LEAL below before it is read.
	MOVL 16(SP), BX

	// emitRepeat
	// Length <= 29 -> one byte (length*8 - 4); otherwise tag 0xec/0xf4/0xfc
	// followed by length-30 in 1/2/3 bytes.
	LEAL -1(SI), BX
	CMPL SI, $0x1d
	JBE repeat_one_match_repeat_encodeBetterBlockAsm512K
	LEAL -30(SI), BX
	CMPL SI, $0x0000011e
	JB repeat_two_match_repeat_encodeBetterBlockAsm512K
	CMPL SI, $0x0001001e
	JB repeat_three_match_repeat_encodeBetterBlockAsm512K
	MOVB $0xfc, (CX)
	MOVL BX, 1(CX)
	ADDQ $0x04, CX
	JMP repeat_end_emit_encodeBetterBlockAsm512K

repeat_three_match_repeat_encodeBetterBlockAsm512K:
	MOVB $0xf4, (CX)
	MOVW BX, 1(CX)
	ADDQ $0x03, CX
	JMP repeat_end_emit_encodeBetterBlockAsm512K

repeat_two_match_repeat_encodeBetterBlockAsm512K:
	MOVB $0xec, (CX)
	MOVB BL, 1(CX)
	ADDQ $0x02, CX
	JMP repeat_end_emit_encodeBetterBlockAsm512K

repeat_one_match_repeat_encodeBetterBlockAsm512K:
	XORL BX, BX
	LEAL -4(BX)(SI*8), BX
	MOVB BL, (CX)
	ADDQ $0x01, CX

repeat_end_emit_encodeBetterBlockAsm512K:
	MOVL AX, 12(SP)
	JMP search_loop_encodeBetterBlockAsm512K

no_repeat_found_encodeBetterBlockAsm512K:
	// Compare the low 4 bytes of each candidate load with the low 4 bytes at s;
	// with no match, continue at the position saved in 20(SP).
	CMPL R10, DI
	JEQ candidate_match_encodeBetterBlockAsm512K
	CMPL R11, DI
	JEQ candidateS_match_encodeBetterBlockAsm512K
	MOVL 20(SP), AX
	JMP search_loop_encodeBetterBlockAsm512K

candidateS_match_encodeBetterBlockAsm512K:
	// The 13-bit-table candidate matched 4 bytes. Look up (and update) the
	// 16-bit table for position s+1; use that entry if its 4 bytes equal bytes
	// 1..4 of the load at s, otherwise fall back to the 13-bit candidate (R8) at s.
	SHRQ $0x08, DI
	MOVQ DI, R10
	SHLQ $0x08, R10
	IMULQ R9, R10
	SHRQ $0x30, R10
	MOVL (BX)(R10*4), SI
	INCL AX
	MOVL AX, (BX)(R10*4)
	CMPL (DX)(SI*1), DI
	JEQ candidate_match_encodeBetterBlockAsm512K
	DECL AX
	MOVL R8, SI

candidate_match_encodeBetterBlockAsm512K:
	// Extend the match backwards while the bytes before s (AX) and before the
	// candidate (SI) agree, not past 12(SP) and not past candidate position 0.
	MOVL 12(SP), BX
	TESTL SI, SI
	JZ match_extend_back_end_encodeBetterBlockAsm512K

match_extend_back_loop_encodeBetterBlockAsm512K:
	CMPL AX, BX
	JBE match_extend_back_end_encodeBetterBlockAsm512K
	MOVB -1(DX)(SI*1), DI
	MOVB -1(DX)(AX*1), R8
	CMPB DI, R8
	JNE match_extend_back_end_encodeBetterBlockAsm512K
	LEAL -1(AX), AX
	DECL SI
	JZ match_extend_back_end_encodeBetterBlockAsm512K
	JMP match_extend_back_loop_encodeBetterBlockAsm512K

match_extend_back_end_encodeBetterBlockAsm512K:
	// Return 0 unless dst + pending literal count + 4 is below the limit in (SP).
	MOVL AX, BX
	SUBL 12(SP), BX
	LEAQ 4(CX)(BX*1), BX
	CMPQ BX, (SP)
	JB match_dst_size_check_encodeBetterBlockAsm512K
	MOVQ $0x00000000, ret+56(FP)
	RET

match_dst_size_check_encodeBetterBlockAsm512K:
	// BX = match start. Advance both positions by 4 and count further matching
	// bytes in R11 (DI = bytes left in src).
	MOVL AX, BX
	ADDL $0x04, AX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), DI
	SUBL AX, DI
	LEAQ (DX)(AX*1), R8
	LEAQ (DX)(SI*1), R9

	// matchLen
	XORL R11, R11
	JMP matchlen_loop_16_entry_match_nolit_encodeBetterBlockAsm512K

matchlen_loopback_16_match_nolit_encodeBetterBlockAsm512K:
	MOVQ (R8)(R11*1), R10
	MOVQ 8(R8)(R11*1), R12
	XORQ (R9)(R11*1), R10
	JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm512K
	XORQ 8(R9)(R11*1), R12
	JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm512K
	LEAL -16(DI), DI
	LEAL 16(R11), R11

matchlen_loop_16_entry_match_nolit_encodeBetterBlockAsm512K:
	CMPL DI, $0x10
	JAE matchlen_loopback_16_match_nolit_encodeBetterBlockAsm512K
	JMP matchlen_match8_match_nolit_encodeBetterBlockAsm512K

matchlen_bsf_16match_nolit_encodeBetterBlockAsm512K:
#ifdef GOAMD64_v3
	TZCNTQ R12, R12
#else
	BSFQ R12, R12
#endif
	SARQ $0x03, R12
	LEAL 8(R11)(R12*1), R11
	JMP match_nolit_end_encodeBetterBlockAsm512K

matchlen_match8_match_nolit_encodeBetterBlockAsm512K:
	CMPL DI, $0x08
	JB matchlen_match4_match_nolit_encodeBetterBlockAsm512K
	MOVQ (R8)(R11*1), R10
	XORQ (R9)(R11*1), R10
	JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm512K
	LEAL -8(DI), DI
	LEAL 8(R11), R11
	JMP matchlen_match4_match_nolit_encodeBetterBlockAsm512K

matchlen_bsf_8_match_nolit_encodeBetterBlockAsm512K:
#ifdef GOAMD64_v3
	TZCNTQ R10, R10
#else
	BSFQ R10, R10
#endif
	SARQ $0x03, R10
	LEAL (R11)(R10*1), R11
	JMP match_nolit_end_encodeBetterBlockAsm512K

matchlen_match4_match_nolit_encodeBetterBlockAsm512K:
	CMPL DI, $0x04
	JB matchlen_match2_match_nolit_encodeBetterBlockAsm512K
	MOVL (R8)(R11*1), R10
	CMPL (R9)(R11*1), R10
	JNE matchlen_match2_match_nolit_encodeBetterBlockAsm512K
	LEAL -4(DI), DI
	LEAL 4(R11), R11

matchlen_match2_match_nolit_encodeBetterBlockAsm512K:
	CMPL DI, $0x01
	JE matchlen_match1_match_nolit_encodeBetterBlockAsm512K
	JB match_nolit_end_encodeBetterBlockAsm512K
	MOVW (R8)(R11*1), R10
	CMPW (R9)(R11*1), R10
	JNE matchlen_match1_match_nolit_encodeBetterBlockAsm512K
	LEAL 2(R11), R11
	SUBL $0x02, DI
	JZ match_nolit_end_encodeBetterBlockAsm512K

matchlen_match1_match_nolit_encodeBetterBlockAsm512K:
	MOVB (R8)(R11*1), R10
	CMPB (R9)(R11*1), R10
	JNE match_nolit_end_encodeBetterBlockAsm512K
	LEAL 1(R11), R11

match_nolit_end_encodeBetterBlockAsm512K:
	// DI = offset (AX - SI). A match with R11 <= 1 and offset > 0x1003f is
	// rejected: searching resumes at 20(SP) + 1.
	MOVL AX, DI
	SUBL SI, DI
	CMPL R11, $0x01
	JA match_length_ok_encodeBetterBlockAsm512K
	CMPL DI, $0x0001003f
	JBE match_length_ok_encodeBetterBlockAsm512K
	MOVL 20(SP), AX
	INCL AX
	JMP search_loop_encodeBetterBlockAsm512K

match_length_ok_encodeBetterBlockAsm512K:
	// Remember the offset for later repeat checks.
	MOVL DI, 16(SP)

	// Check if we can combine lit+copy
	// SI = pending literal count (BX - 12(SP)).
	//   no literals          -> match_emit_nolits
	//   offset < 64          -> match_emit_lits (separate literal + copy)
	//   offset > 0x1003f     -> match_emit_copy3 (fused only with at most 3 literals)
	//   otherwise            -> emitCopy2WithLits when there are at most 4 literals
	// NOTE(review): JL below is a signed compare, while the neighbouring offset
	// checks use unsigned JA/JBE.
	MOVLQZX 12(SP), R8
	MOVL BX, SI
	SUBL R8, SI
	JZ match_emit_nolits_encodeBetterBlockAsm512K
	CMPL DI, $0x00000040
	JL match_emit_lits_encodeBetterBlockAsm512K
	CMPL DI, $0x0001003f
	JA match_emit_copy3_encodeBetterBlockAsm512K
	CMPL SI, $0x04
	JA match_emit_lits_encodeBetterBlockAsm512K

	// R8 = 4 source bytes starting at 12(SP); they are stored after the copy
	// header, and dst then advances by the literal count (SI) only.
	MOVL (DX)(R8*1), R8
	ADDL R11, AX
	ADDL $0x04, R11
	MOVL AX, 12(SP)

	// emitCopy2WithLits
	// R9 receives length-11 when length-4 >= 7 and is then emitted as a
	// following repeat; it stays 0 otherwise.
	XORQ R9, R9
	SUBL $0x40, DI
	LEAL -11(R11), R10
	LEAL -4(R11), R11
	MOVW DI, 1(CX)
	CMPL R11, $0x07
	CMOVLGE R10, R9
	MOVQ $0x00000007, DI
	CMOVLLT R11, DI
	LEAL -1(SI)(DI*4), DI
	MOVL $0x00000003, R10
	LEAL (R10)(DI*8), DI
	MOVB DI, (CX)
	ADDQ $0x03, CX
	MOVL R8, (CX)
	ADDQ SI, CX
	TESTL R9, R9
	JZ match_nolit_emitcopy_end_encodeBetterBlockAsm512K

	// emitRepeat
	LEAL -1(R9), SI
	CMPL R9, $0x1d
	JBE repeat_one_match_emit_repeat_copy2_encodeBetterBlockAsm512K
	LEAL -30(R9), SI
	CMPL R9, $0x0000011e
	JB repeat_two_match_emit_repeat_copy2_encodeBetterBlockAsm512K
	CMPL R9, $0x0001001e
	JB repeat_three_match_emit_repeat_copy2_encodeBetterBlockAsm512K
	MOVB $0xfc, (CX)
	MOVL SI, 1(CX)
	ADDQ $0x04, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K

repeat_three_match_emit_repeat_copy2_encodeBetterBlockAsm512K:
	MOVB $0xf4, (CX)
	MOVW SI, 1(CX)
	ADDQ $0x03, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K

repeat_two_match_emit_repeat_copy2_encodeBetterBlockAsm512K:
	MOVB $0xec, (CX)
	MOVB SI, 1(CX)
	ADDQ $0x02, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K

repeat_one_match_emit_repeat_copy2_encodeBetterBlockAsm512K:
	XORL SI, SI
	LEAL -4(SI)(R9*8), SI
	MOVB SI, (CX)
	ADDQ $0x01, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K

match_emit_copy3_encodeBetterBlockAsm512K:
	// Offset > 0x1003f: fuse at most 3 literals into the copy header.
	CMPL SI, $0x03
	JA match_emit_lits_encodeBetterBlockAsm512K
	MOVLQZX 12(SP), R8
	MOVL (DX)(R8*1), R8
	ADDL R11, AX
	ADDL $0x04, R11
	MOVL AX, 12(SP)

	// emitCopy3
	// DI = ((offset - 65536) << 11) + 7 + literal count * 8; the length goes
	// into bits 5 and up when length-4 <= 60, otherwise into 1-3 extra bytes.
	LEAL -4(R11), R11
	LEAL -65536(DI), DI
	SHLL $0x0b, DI
	LEAL 7(DI)(SI*8), DI
	CMPL R11, $0x3c
	JBE emit_copy3_0_match_emit_lits_encodeBetterBlockAsm512K
	LEAL -60(R11), R9
	CMPL R11, $0x0000013c
	JB emit_copy3_1_match_emit_lits_encodeBetterBlockAsm512K
	CMPL R11, $0x0001003c
	JB emit_copy3_2_match_emit_lits_encodeBetterBlockAsm512K
	ADDL $0x000007e0, DI
	MOVL DI, (CX)
	MOVL R9, 4(CX)
	ADDQ $0x07, CX
	JMP match_emit_copy_litsencodeBetterBlockAsm512K

emit_copy3_2_match_emit_lits_encodeBetterBlockAsm512K:
	ADDL $0x000007c0, DI
	MOVL DI, (CX)
	MOVW R9, 4(CX)
	ADDQ $0x06, CX
	JMP match_emit_copy_litsencodeBetterBlockAsm512K

emit_copy3_1_match_emit_lits_encodeBetterBlockAsm512K:
	ADDL $0x000007a0, DI
	MOVL DI, (CX)
	MOVB R9, 4(CX)
	ADDQ $0x05, CX
	JMP match_emit_copy_litsencodeBetterBlockAsm512K

emit_copy3_0_match_emit_lits_encodeBetterBlockAsm512K:
	SHLL $0x05, R11
	ORL R11, DI
	MOVL DI, (CX)
	ADDQ $0x04, CX

match_emit_copy_litsencodeBetterBlockAsm512K:
	// Store 4 source bytes but advance dst by the literal count (SI) only.
	MOVL R8, (CX)
	ADDQ SI, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K

match_emit_lits_encodeBetterBlockAsm512K:
	// Emit the SI pending literals starting at src + 12(SP) (offset held in R8),
	// then emit the copy at match_emit_nolits.
	LEAQ (DX)(R8*1), R8

	// emitLiteral
	LEAL -1(SI), R9
	CMPL R9, $0x1d
	JB one_byte_match_emit_encodeBetterBlockAsm512K
	SUBL $0x1d, R9
	CMPL R9, $0x00000100
	JB two_bytes_match_emit_encodeBetterBlockAsm512K
	CMPL R9, $0x00010000
	JB three_bytes_match_emit_encodeBetterBlockAsm512K
	MOVL R9, R10
	SHRL $0x10, R10
	MOVB $0xf8, (CX)
	MOVW R9, 1(CX)
	MOVB R10, 3(CX)
	ADDQ $0x04, CX
	ADDL $0x1d, R9
	JMP memmove_long_match_emit_encodeBetterBlockAsm512K

three_bytes_match_emit_encodeBetterBlockAsm512K:
	MOVB $0xf0, (CX)
	MOVW R9, 1(CX)
	ADDQ $0x03, CX
	ADDL $0x1d, R9
	JMP memmove_long_match_emit_encodeBetterBlockAsm512K

two_bytes_match_emit_encodeBetterBlockAsm512K:
	MOVB $0xe8, (CX)
	MOVB R9, 1(CX)
	ADDL $0x1d, R9
	ADDQ $0x02, CX
	CMPL R9, $0x40
	JB memmove_midmatch_emit_encodeBetterBlockAsm512K
	JMP memmove_long_match_emit_encodeBetterBlockAsm512K

one_byte_match_emit_encodeBetterBlockAsm512K:
	SHLB $0x03, R9
	MOVB R9, (CX)
	ADDQ $0x01, CX
	LEAQ (CX)(SI*1), R9

	// genMemMoveShort
	// margin: 8, min move: 1
	CMPQ SI, $0x08
	JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm512K_memmove_move_8
	CMPQ SI, $0x10
	JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm512K_memmove_move_8through16
	CMPQ SI, $0x20
	JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm512K_memmove_move_17through32
	JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm512K_memmove_move_33through64

emit_lit_memmove_match_emit_encodeBetterBlockAsm512K_memmove_move_8:
	MOVQ (R8), R10
	MOVQ R10, (CX)
	JMP memmove_end_copy_match_emit_encodeBetterBlockAsm512K

emit_lit_memmove_match_emit_encodeBetterBlockAsm512K_memmove_move_8through16:
	MOVQ (R8), R10
	MOVQ -8(R8)(SI*1), R8
	MOVQ R10, (CX)
	MOVQ R8, -8(CX)(SI*1)
	JMP memmove_end_copy_match_emit_encodeBetterBlockAsm512K

emit_lit_memmove_match_emit_encodeBetterBlockAsm512K_memmove_move_17through32:
	MOVOU (R8), X0
	MOVOU -16(R8)(SI*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(SI*1)
	JMP memmove_end_copy_match_emit_encodeBetterBlockAsm512K

emit_lit_memmove_match_emit_encodeBetterBlockAsm512K_memmove_move_33through64:
	MOVOU (R8), X0
	MOVOU 16(R8), X1
	MOVOU -32(R8)(SI*1), X2
	MOVOU -16(R8)(SI*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(SI*1)
	MOVOU X3, -16(CX)(SI*1)

memmove_end_copy_match_emit_encodeBetterBlockAsm512K:
	MOVQ R9, CX
	JMP match_emit_nolits_encodeBetterBlockAsm512K

memmove_midmatch_emit_encodeBetterBlockAsm512K:
	LEAQ (CX)(SI*1), R9

	// genMemMoveShort
	// margin: 8, min move: 30
	CMPQ SI, $0x20
	JBE emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm512K_memmove_move_17through32
	JMP emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm512K_memmove_move_33through64

emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm512K_memmove_move_17through32:
	MOVOU (R8), X0
	MOVOU -16(R8)(SI*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(SI*1)
	JMP memmove_mid_end_copy_match_emit_encodeBetterBlockAsm512K

emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm512K_memmove_move_33through64:
	MOVOU (R8), X0
	MOVOU 16(R8), X1
	MOVOU -32(R8)(SI*1), X2
	MOVOU -16(R8)(SI*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(SI*1)
	MOVOU X3, -16(CX)(SI*1)

memmove_mid_end_copy_match_emit_encodeBetterBlockAsm512K:
	MOVQ R9, CX
	JMP match_emit_nolits_encodeBetterBlockAsm512K

memmove_long_match_emit_encodeBetterBlockAsm512K:
	LEAQ (CX)(SI*1), R9

	// genMemMoveLong
	MOVOU (R8), X0
	MOVOU 16(R8), X1
	MOVOU -32(R8)(SI*1), X2
	MOVOU -16(R8)(SI*1), X3
	MOVQ SI, R12
	SHRQ $0x05, R12
	MOVQ CX, R10
	ANDL $0x0000001f, R10
	MOVQ $0x00000040, R13
	SUBQ R10, R13
	DECQ R12
	JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm512Klarge_forward_sse_loop_32
	LEAQ -32(R8)(R13*1), R10
	LEAQ -32(CX)(R13*1), R14

emit_lit_memmove_long_match_emit_encodeBetterBlockAsm512Klarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R14)
	MOVOA X5, 16(R14)
	ADDQ $0x20, R14
	ADDQ $0x20, R10
	ADDQ $0x20, R13
	DECQ R12
	JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm512Klarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeBetterBlockAsm512Klarge_forward_sse_loop_32:
	MOVOU -32(R8)(R13*1), X4
	MOVOU -16(R8)(R13*1), X5
	MOVOA X4, -32(CX)(R13*1)
	MOVOA X5, -16(CX)(R13*1)
	ADDQ $0x20, R13
	CMPQ SI, R13
	JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm512Klarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(SI*1)
	MOVOU X3, -16(CX)(SI*1)
	MOVQ R9, CX

match_emit_nolits_encodeBetterBlockAsm512K:
	// Advance s past the match, set 12(SP) = s; R11 becomes the full match length.
	ADDL R11, AX
	ADDL $0x04, R11
	MOVL AX, 12(SP)

	// emitCopy
	// Offsets up to 0x1003f go to two_byte_offset; larger ones use emitCopy3.
	CMPL DI, $0x0001003f
	JBE two_byte_offset_match_nolit_encodeBetterBlockAsm512K

	// emitCopy3
	LEAL -4(R11), R11
	LEAL -65536(DI), SI
	SHLL $0x0b, SI
	ADDL $0x07, SI
	CMPL R11, $0x3c
	JBE emit_copy3_0_match_nolit_encodeBetterBlockAsm512K_emit3
	LEAL -60(R11), DI
	CMPL R11, $0x0000013c
	JB emit_copy3_1_match_nolit_encodeBetterBlockAsm512K_emit3
	CMPL R11, $0x0001003c
	JB emit_copy3_2_match_nolit_encodeBetterBlockAsm512K_emit3
	ADDL $0x000007e0, SI
	MOVL SI, (CX)
	MOVL DI, 4(CX)
	ADDQ $0x07, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K

emit_copy3_2_match_nolit_encodeBetterBlockAsm512K_emit3:
	ADDL $0x000007c0, SI
	MOVL SI, (CX)
	MOVW DI, 4(CX)
	ADDQ $0x06, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K

emit_copy3_1_match_nolit_encodeBetterBlockAsm512K_emit3:
	ADDL $0x000007a0, SI
	MOVL SI, (CX)
	MOVB DI, 4(CX)
	ADDQ $0x05, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K

emit_copy3_0_match_nolit_encodeBetterBlockAsm512K_emit3:
	SHLL $0x05, R11
	ORL R11, SI
	MOVL SI, (CX)
	ADDQ $0x04, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K

two_byte_offset_match_nolit_encodeBetterBlockAsm512K:
	// Offset <= 1024: 2-byte encoding for lengths below 19, one extra length
	// byte for lengths below 274, otherwise a 2-byte copy followed by a repeat.
	// Offsets above 1024 use emitCopy2.
	CMPL DI, $0x00000400
	JA two_byte_match_nolit_encodeBetterBlockAsm512K
	CMPL R11, $0x00000013
	JAE emit_one_longer_match_nolit_encodeBetterBlockAsm512K
	LEAL -1(DI), SI
	SHLL $0x06, SI
	LEAL -15(SI)(R11*4), SI
	MOVW SI, (CX)
	ADDQ $0x02, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K

emit_one_longer_match_nolit_encodeBetterBlockAsm512K:
	CMPL R11, $0x00000112
	JAE emit_copy1_repeat_match_nolit_encodeBetterBlockAsm512K
	LEAL -1(DI), SI
	SHLL $0x06, SI
	LEAL 61(SI), SI
	MOVW SI, (CX)
	LEAL -18(R11), SI
	MOVB SI, 2(CX)
	ADDQ $0x03, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K

emit_copy1_repeat_match_nolit_encodeBetterBlockAsm512K:
	LEAL -1(DI), SI
	SHLL $0x06, SI
	LEAL 57(SI), SI
	MOVW SI, (CX)
	ADDQ $0x02, CX
	SUBL $0x12, R11

	// emitRepeat
	LEAL -1(R11), SI
	CMPL R11, $0x1d
	JBE repeat_one_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm512K
	LEAL -30(R11), SI
	CMPL R11, $0x0000011e
	JB repeat_two_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm512K
	CMPL R11, $0x0001001e
	JB repeat_three_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm512K
	MOVB $0xfc, (CX)
	MOVL SI, 1(CX)
	ADDQ $0x04, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K

repeat_three_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm512K:
	MOVB $0xf4, (CX)
	MOVW SI, 1(CX)
	ADDQ $0x03, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K

repeat_two_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm512K:
	MOVB $0xec, (CX)
	MOVB SI, 1(CX)
	ADDQ $0x02, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K

repeat_one_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm512K:
	XORL SI, SI
	LEAL -4(SI)(R11*8), SI
	MOVB SI, (CX)
	ADDQ $0x01, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K

two_byte_match_nolit_encodeBetterBlockAsm512K:
	// emitCopy2
	// offset-64 is stored as 16 bits at dst+1; the tag byte is 2 + (length-4)*4
	// when length-4 <= 60, otherwise 0xf6/0xfa/0xfe with length-64 in 1/2/3 bytes.
	LEAL -64(DI), DI
	LEAL -4(R11), R11
	MOVW DI, 1(CX)
	CMPL R11, $0x3c
	JBE emit_copy2_0_match_nolit_encodeBetterBlockAsm512K_emit2
	LEAL -60(R11), SI
	CMPL R11, $0x0000013c
	JB emit_copy2_1_match_nolit_encodeBetterBlockAsm512K_emit2
	CMPL R11, $0x0001003c
	JB emit_copy2_2_match_nolit_encodeBetterBlockAsm512K_emit2
	MOVB $0xfe, (CX)
	MOVL SI, 3(CX)
	ADDQ $0x06, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K

emit_copy2_2_match_nolit_encodeBetterBlockAsm512K_emit2:
	MOVB $0xfa, (CX)
	MOVW SI, 3(CX)
	ADDQ $0x05, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K

emit_copy2_1_match_nolit_encodeBetterBlockAsm512K_emit2:
	MOVB $0xf6, (CX)
	MOVB SI, 3(CX)
	ADDQ $0x04, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K

emit_copy2_0_match_nolit_encodeBetterBlockAsm512K_emit2:
	MOVL $0x00000002, SI
	LEAL (SI)(R11*4), SI
	MOVB SI, (CX)
	ADDQ $0x03, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K

	// NOTE(review): the code from here down to match_nolit_emitcopy_end appears
	// unreachable. It follows an unconditional JMP, has no entry label, and the
	// labels inside it are only targeted from inside this same stretch.
	// emitLiteralsDstP
	MOVL 12(SP), SI
	CMPL SI, BX
	JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm512K
	MOVL BX, DI
	MOVL BX, 12(SP)
	LEAQ (DX)(SI*1), R8
	SUBL SI, DI

	// emitLiteral
	LEAL -1(DI), SI
	CMPL SI, $0x1d
	JB one_byte_match_emit_repeat_encodeBetterBlockAsm512K
	SUBL $0x1d, SI
	CMPL SI, $0x00000100
	JB two_bytes_match_emit_repeat_encodeBetterBlockAsm512K
	CMPL SI, $0x00010000
	JB three_bytes_match_emit_repeat_encodeBetterBlockAsm512K
	MOVL SI, R9
	SHRL $0x10, R9
	MOVB $0xf8, (CX)
	MOVW SI, 1(CX)
	MOVB R9, 3(CX)
	ADDQ $0x04, CX
	ADDL $0x1d, SI
	JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm512K

three_bytes_match_emit_repeat_encodeBetterBlockAsm512K:
	MOVB $0xf0, (CX)
	MOVW SI, 1(CX)
	ADDQ $0x03, CX
	ADDL $0x1d, SI
	JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm512K

two_bytes_match_emit_repeat_encodeBetterBlockAsm512K:
	MOVB $0xe8, (CX)
	MOVB SI, 1(CX)
	ADDL $0x1d, SI
	ADDQ $0x02, CX
	CMPL SI, $0x40
	JB memmove_midmatch_emit_repeat_encodeBetterBlockAsm512K
	JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm512K

one_byte_match_emit_repeat_encodeBetterBlockAsm512K:
	SHLB $0x03, SI
	MOVB SI, (CX)
	ADDQ $0x01, CX
	LEAQ (CX)(DI*1), SI

	// genMemMoveShort
	// margin: 8, min move: 1
	CMPQ DI, $0x08
	JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm512K_memmove_move_8
	CMPQ DI, $0x10
	JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm512K_memmove_move_8through16
	CMPQ DI, $0x20
	JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm512K_memmove_move_17through32
	JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm512K_memmove_move_33through64

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm512K_memmove_move_8:
	MOVQ (R8), R9
	MOVQ R9, (CX)
	JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm512K

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm512K_memmove_move_8through16:
	MOVQ (R8), R9
	MOVQ -8(R8)(DI*1), R8
	MOVQ R9, (CX)
	MOVQ R8, -8(CX)(DI*1)
	JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm512K

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm512K_memmove_move_17through32:
	MOVOU (R8), X0
	MOVOU -16(R8)(DI*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(DI*1)
	JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm512K

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm512K_memmove_move_33through64:
	MOVOU (R8), X0
	MOVOU 16(R8), X1
	MOVOU -32(R8)(DI*1), X2
	MOVOU -16(R8)(DI*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(DI*1)
	MOVOU X3, -16(CX)(DI*1)

memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm512K:
	MOVQ SI, CX
	JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm512K

memmove_midmatch_emit_repeat_encodeBetterBlockAsm512K:
	LEAQ (CX)(DI*1), SI

	// genMemMoveShort
	// margin: 8, min move: 30
	CMPQ DI, $0x20
	JBE emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm512K_memmove_move_17through32
	JMP emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm512K_memmove_move_33through64

emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm512K_memmove_move_17through32:
	MOVOU (R8), X0
	MOVOU -16(R8)(DI*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(DI*1)
	JMP memmove_mid_end_copy_match_emit_repeat_encodeBetterBlockAsm512K

emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm512K_memmove_move_33through64:
	MOVOU (R8), X0
	MOVOU 16(R8), X1
	MOVOU -32(R8)(DI*1), X2
	MOVOU -16(R8)(DI*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(DI*1)
	MOVOU X3, -16(CX)(DI*1)

memmove_mid_end_copy_match_emit_repeat_encodeBetterBlockAsm512K:
	MOVQ SI, CX
	JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm512K

memmove_long_match_emit_repeat_encodeBetterBlockAsm512K:
	LEAQ (CX)(DI*1), SI

	// genMemMoveLong
	MOVOU (R8), X0
	MOVOU 16(R8), X1
	MOVOU -32(R8)(DI*1), X2
	MOVOU -16(R8)(DI*1), X3
	MOVQ DI, R10
	SHRQ $0x05, R10
	MOVQ CX, R9
	ANDL $0x0000001f, R9
	MOVQ $0x00000040, R12
	SUBQ R9, R12
	DECQ R10
	JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm512Klarge_forward_sse_loop_32
	LEAQ -32(R8)(R12*1), R9
	LEAQ -32(CX)(R12*1), R13

emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm512Klarge_big_loop_back:
	MOVOU (R9), X4
	MOVOU 16(R9), X5
	MOVOA X4, (R13)
	MOVOA X5, 16(R13)
	ADDQ $0x20, R13
	ADDQ $0x20, R9
	ADDQ $0x20, R12
	DECQ R10
	JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm512Klarge_big_loop_back

emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm512Klarge_forward_sse_loop_32:
	MOVOU -32(R8)(R12*1), X4
	MOVOU -16(R8)(R12*1), X5
	MOVOA X4, -32(CX)(R12*1)
	MOVOA X5, -16(CX)(R12*1)
	ADDQ $0x20, R12
	CMPQ DI, R12
	JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm512Klarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(DI*1)
	MOVOU X3, -16(CX)(DI*1)
	MOVQ SI, CX

emit_literal_done_match_emit_repeat_encodeBetterBlockAsm512K:
	ADDL R11, AX
	ADDL $0x04, R11
	MOVL AX, 12(SP)

	// emitRepeat
	LEAL -1(R11), SI
	CMPL R11, $0x1d
	JBE repeat_one_match_nolit_repeat_encodeBetterBlockAsm512K
	LEAL -30(R11), SI
	CMPL R11, $0x0000011e
	JB repeat_two_match_nolit_repeat_encodeBetterBlockAsm512K
	CMPL R11, $0x0001001e
	JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm512K
	MOVB $0xfc, (CX)
	MOVL SI, 1(CX)
	ADDQ $0x04, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K

repeat_three_match_nolit_repeat_encodeBetterBlockAsm512K:
	MOVB $0xf4, (CX)
	MOVW SI, 1(CX)
	ADDQ $0x03, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K

repeat_two_match_nolit_repeat_encodeBetterBlockAsm512K:
	MOVB $0xec, (CX)
	MOVB SI, 1(CX)
	ADDQ $0x02, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K

repeat_one_match_nolit_repeat_encodeBetterBlockAsm512K:
	XORL SI, SI
	LEAL -4(SI)(R11*8), SI
	MOVB SI, (CX)
	ADDQ $0x01, CX

match_nolit_emitcopy_end_encodeBetterBlockAsm512K:
	// Finish with emit_remainder once s reaches 8(SP); return 0 if dst reached the limit.
	CMPL AX, 8(SP)
	JAE emit_remainder_encodeBetterBlockAsm512K
	CMPQ CX, (SP)
	JB match_nolit_dst_ok_encodeBetterBlockAsm512K
	MOVQ $0x00000000, ret+56(FP)
	RET

match_nolit_dst_ok_encodeBetterBlockAsm512K:
	// Index the area covered by the match. With BX = match start + 1 and
	// R9 = s - 2: store BX and R9 in the 16-bit-hash table, and BX+1 and R9+1
	// in the 13-bit-hash table (hashed from the loads one byte further on).
	MOVQ tmp+48(FP), SI
	MOVQ $0x00cf1bbcdcbfa563, DI
	MOVQ $0x9e3779b1, R8
	LEAQ 1(BX), BX
	LEAQ -2(AX), R9
	MOVQ (DX)(BX*1), R10
	MOVQ 1(DX)(BX*1), R11
	MOVQ (DX)(R9*1), R12
	MOVQ 1(DX)(R9*1), R13
	SHLQ $0x08, R10
	IMULQ DI, R10
	SHRQ $0x30, R10
	SHLQ $0x20, R11
	IMULQ R8, R11
	SHRQ $0x33, R11
	SHLQ $0x08, R12
	IMULQ DI, R12
	SHRQ $0x30, R12
	SHLQ $0x20, R13
	IMULQ R8, R13
	SHRQ $0x33, R13
	LEAQ 1(BX), R8
	LEAQ 1(R9), R14
	MOVL BX, (SI)(R10*4)
	MOVL R9, (SI)(R12*4)

	// R10 = (BX + R9 + 1) >> 1, the midpoint; then BX++ and R9--.
	LEAQ 1(R9)(BX*1), R10
	SHRQ $0x01, R10
	ADDQ $0x01, BX
	SUBQ $0x01, R9
	MOVL R8, 262144(SI)(R11*4)
	MOVL R14, 262144(SI)(R13*4)

index_loop_encodeBetterBlockAsm512K:
	// Walk BX and R10 forward in steps of 2 until R10 reaches R9, hashing the
	// loads at both positions into the 16-bit-hash table.
	CMPQ R10, R9
	JAE search_loop_encodeBetterBlockAsm512K
	MOVQ (DX)(BX*1), R8
	MOVQ (DX)(R10*1), R11
	SHLQ $0x08, R8
	IMULQ DI, R8
	SHRQ $0x30, R8
	SHLQ $0x08, R11
	IMULQ DI, R11
	SHRQ $0x30, R11
	MOVL BX, (SI)(R8*4)

	// NOTE(review): R11 is the hash of the load at R10, but the value stored
	// is R9 — confirm against the generator whether R10 was intended.
	MOVL R9, (SI)(R11*4)
	ADDQ $0x02, BX
	ADDQ $0x02, R10
	JMP index_loop_encodeBetterBlockAsm512K

emit_remainder_encodeBetterBlockAsm512K:
	// Return 0 unless dst + remaining byte count + 4 is below the limit in (SP).
	MOVQ src_len+32(FP), AX
	SUBL 12(SP), AX
	LEAQ 4(CX)(AX*1), AX
	CMPQ AX, (SP)
	JB emit_remainder_ok_encodeBetterBlockAsm512K
	MOVQ $0x00000000, ret+56(FP)
	RET

emit_remainder_ok_encodeBetterBlockAsm512K:
	MOVQ src_len+32(FP), AX

	// emitLiteralsDstP
	// Emit src[12(SP):src_len] as one literal run (skipped when empty).
	// AX = source pointer, SI = literal count; DX is reused as scratch from here on.
	MOVL 12(SP), BX
	CMPL BX, AX
	JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm512K
	MOVL AX, SI
	MOVL AX, 12(SP)
	LEAQ (DX)(BX*1), AX
	SUBL BX, SI

	// emitLiteral
	LEAL -1(SI), DX
	CMPL DX, $0x1d
	JB one_byte_emit_remainder_encodeBetterBlockAsm512K
	SUBL $0x1d, DX
	CMPL DX, $0x00000100
	JB two_bytes_emit_remainder_encodeBetterBlockAsm512K
	CMPL DX, $0x00010000
	JB three_bytes_emit_remainder_encodeBetterBlockAsm512K
	MOVL DX, BX
	SHRL $0x10, BX
	MOVB $0xf8, (CX)
	MOVW DX, 1(CX)
	MOVB BL, 3(CX)
	ADDQ $0x04, CX
	ADDL $0x1d, DX
	JMP memmove_long_emit_remainder_encodeBetterBlockAsm512K

three_bytes_emit_remainder_encodeBetterBlockAsm512K:
	MOVB $0xf0, (CX)
	MOVW DX, 1(CX)
	ADDQ $0x03, CX
	ADDL $0x1d, DX
	JMP memmove_long_emit_remainder_encodeBetterBlockAsm512K

two_bytes_emit_remainder_encodeBetterBlockAsm512K:
	MOVB $0xe8, (CX)
	MOVB DL, 1(CX)
	ADDL $0x1d, DX
	ADDQ $0x02, CX
	CMPL DX, $0x40
	JB memmove_midemit_remainder_encodeBetterBlockAsm512K
	JMP memmove_long_emit_remainder_encodeBetterBlockAsm512K

one_byte_emit_remainder_encodeBetterBlockAsm512K:
	SHLB $0x03, DL
	MOVB DL, (CX)
	ADDQ $0x01, CX
	LEAQ (CX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// margin: -1, min move: 1
	// Unlike the in-loop copies above, the 1..8-byte cases here move exactly the requested bytes.
	CMPQ BX, $0x03
	JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm512K_memmove_move_1or2
	JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm512K_memmove_move_3
	CMPQ BX, $0x08
	JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm512K_memmove_move_4through8
	CMPQ BX, $0x10
	JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm512K_memmove_move_8through16
	CMPQ BX, $0x20
	JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm512K_memmove_move_17through32
	JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm512K_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm512K_memmove_move_1or2:
	MOVB (AX), SI
	MOVB -1(AX)(BX*1), AL
	MOVB SI, (CX)
	MOVB AL, -1(CX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm512K

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm512K_memmove_move_3:
	MOVW (AX), SI
	MOVB 2(AX), AL
	MOVW SI, (CX)
	MOVB AL, 2(CX)
	JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm512K

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm512K_memmove_move_4through8:
	MOVL (AX), SI
	MOVL -4(AX)(BX*1), AX
	MOVL SI, (CX)
	MOVL AX, -4(CX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm512K

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm512K_memmove_move_8through16:
	MOVQ (AX), SI
	MOVQ -8(AX)(BX*1), AX
	MOVQ SI, (CX)
	MOVQ AX, -8(CX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm512K

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm512K_memmove_move_17through32:
	MOVOU (AX), X0
	MOVOU -16(AX)(BX*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm512K

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm512K_memmove_move_33through64:
	MOVOU (AX), X0
	MOVOU 16(AX), X1
	MOVOU -32(AX)(BX*1), X2
	MOVOU -16(AX)(BX*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)

memmove_end_copy_emit_remainder_encodeBetterBlockAsm512K:
	MOVQ DX, CX
	JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm512K

memmove_midemit_remainder_encodeBetterBlockAsm512K:
	LEAQ (CX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// margin: -2, min move: 30
	CMPQ BX, $0x20
	JBE emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm512K_memmove_move_17through32
	JMP emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm512K_memmove_move_33through64

emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm512K_memmove_move_17through32:
	MOVOU (AX), X0
	MOVOU -16(AX)(BX*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(BX*1)
	JMP memmove_mid_end_copy_emit_remainder_encodeBetterBlockAsm512K

emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm512K_memmove_move_33through64:
	MOVOU (AX), X0
	MOVOU 16(AX), X1
	MOVOU -32(AX)(BX*1), X2
	MOVOU -16(AX)(BX*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)

memmove_mid_end_copy_emit_remainder_encodeBetterBlockAsm512K:
	MOVQ DX, CX
	JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm512K

memmove_long_emit_remainder_encodeBetterBlockAsm512K:
	LEAQ (CX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	MOVOU (AX), X0
	MOVOU 16(AX), X1
	MOVOU -32(AX)(BX*1), X2
	MOVOU -16(AX)(BX*1), X3
	MOVQ BX, DI
	SHRQ $0x05, DI
	MOVQ CX, SI
	ANDL $0x0000001f, SI
	MOVQ $0x00000040, R8
	SUBQ SI, R8
	DECQ DI
	JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm512Klarge_forward_sse_loop_32
	LEAQ -32(AX)(R8*1), SI
	LEAQ -32(CX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm512Klarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ $0x20, R9
	ADDQ $0x20, SI
	ADDQ $0x20, R8
	DECQ DI
	JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm512Klarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm512Klarge_forward_sse_loop_32:
	MOVOU -32(AX)(R8*1), X4
	MOVOU -16(AX)(R8*1), X5
	MOVOA X4, -32(CX)(R8*1)
	MOVOA X5, -16(CX)(R8*1)
	ADDQ $0x20, R8
	CMPQ BX, R8
	JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm512Klarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)
	MOVQ DX, CX

emit_literal_done_emit_remainder_encodeBetterBlockAsm512K:
	// Return the number of bytes written: CX - dst_base.
	MOVQ dst_base+0(FP), AX
	SUBQ AX, CX
	MOVQ CX, ret+56(FP)
	RET

// func encodeBetterBlockAsm64K(dst []byte, src []byte, tmp *[73728]byte) int
// Requires: BMI, CMOV, SSE2
TEXT ·encodeBetterBlockAsm64K(SB), $24-64
	MOVQ tmp+48(FP), AX
	MOVQ dst_base+0(FP), CX
	MOVQ $0x00000240, DX
	PXOR X0, X0

zero_loop_encodeBetterBlockAsm64K:
	MOVOU X0, (AX)
	MOVOU X0, 16(AX)
	MOVOU X0, 32(AX)
	MOVOU X0, 48(AX)
	MOVOU X0, 64(AX)
	MOVOU X0, 80(AX)
	MOVOU X0, 96(AX)
	MOVOU X0, 112(AX)
	ADDQ $0x80, AX
	DECQ DX
	JNZ zero_loop_encodeBetterBlockAsm64K
	MOVL $0x00000000, 12(SP)
	MOVQ src_len+32(FP), AX
	LEAQ -11(AX), DX
	LEAQ -8(AX), BX
	MOVL BX, 8(SP)
	SHRQ $0x05, AX
	SUBL AX, DX
	LEAQ (CX)(DX*1), DX
	MOVQ DX, (SP)
	MOVL
$0x00000001, AX MOVL AX, 16(SP) MOVQ src_base+24(FP), DX search_loop_encodeBetterBlockAsm64K: MOVQ tmp+48(FP), BX MOVL AX, SI SUBL 12(SP), SI SHRL $0x06, SI LEAL 1(AX)(SI*1), SI CMPL SI, 8(SP) JAE emit_remainder_encodeBetterBlockAsm64K MOVQ (DX)(AX*1), DI MOVL SI, 20(SP) MOVQ $0x0000cf1bbcdcbf9b, R9 MOVQ $0x9e3779b1, SI MOVQ DI, R10 MOVQ DI, R11 SHLQ $0x10, R10 IMULQ R9, R10 SHRQ $0x31, R10 SHLQ $0x20, R11 IMULQ SI, R11 SHRQ $0x34, R11 MOVWLZX (BX)(R10*2), SI MOVWLZX 65536(BX)(R11*2), R8 MOVW AX, (BX)(R10*2) MOVW AX, 65536(BX)(R11*2) MOVQ (DX)(SI*1), R10 CMPQ R10, DI JEQ candidate_match_encodeBetterBlockAsm64K MOVQ (DX)(R8*1), R11 CMPQ R11, DI MOVL AX, R12 SUBL 16(SP), R12 MOVQ (DX)(R12*1), R12 MOVQ $0x000000ffffffff00, R13 XORQ DI, R12 TESTQ R13, R12 JNE no_repeat_found_encodeBetterBlockAsm64K LEAL 1(AX), BX MOVL 12(SP), SI MOVL BX, DI SUBL 16(SP), DI JZ repeat_extend_back_end_encodeBetterBlockAsm64K repeat_extend_back_loop_encodeBetterBlockAsm64K: CMPL BX, SI JBE repeat_extend_back_end_encodeBetterBlockAsm64K MOVB -1(DX)(DI*1), R8 MOVB -1(DX)(BX*1), R9 CMPB R8, R9 JNE repeat_extend_back_end_encodeBetterBlockAsm64K LEAL -1(BX), BX DECL DI JNZ repeat_extend_back_loop_encodeBetterBlockAsm64K repeat_extend_back_end_encodeBetterBlockAsm64K: MOVL BX, SI SUBL 12(SP), SI LEAQ 4(CX)(SI*1), SI CMPQ SI, (SP) JB repeat_dst_size_check_encodeBetterBlockAsm64K MOVQ $0x00000000, ret+56(FP) RET repeat_dst_size_check_encodeBetterBlockAsm64K: // emitLiteralsDstP MOVL 12(SP), SI CMPL SI, BX JEQ emit_literal_done_repeat_emit_encodeBetterBlockAsm64K MOVL BX, DI MOVL BX, 12(SP) LEAQ (DX)(SI*1), R8 SUBL SI, DI // emitLiteral LEAL -1(DI), SI CMPL SI, $0x1d JB one_byte_repeat_emit_encodeBetterBlockAsm64K SUBL $0x1d, SI CMPL SI, $0x00000100 JB two_bytes_repeat_emit_encodeBetterBlockAsm64K JB three_bytes_repeat_emit_encodeBetterBlockAsm64K MOVL SI, R9 SHRL $0x10, R9 MOVB $0xf8, (CX) MOVW SI, 1(CX) MOVB R9, 3(CX) ADDQ $0x04, CX ADDL $0x1d, SI JMP 
memmove_long_repeat_emit_encodeBetterBlockAsm64K three_bytes_repeat_emit_encodeBetterBlockAsm64K: MOVB $0xf0, (CX) MOVW SI, 1(CX) ADDQ $0x03, CX ADDL $0x1d, SI JMP memmove_long_repeat_emit_encodeBetterBlockAsm64K two_bytes_repeat_emit_encodeBetterBlockAsm64K: MOVB $0xe8, (CX) MOVB SI, 1(CX) ADDL $0x1d, SI ADDQ $0x02, CX CMPL SI, $0x40 JB memmove_midrepeat_emit_encodeBetterBlockAsm64K JMP memmove_long_repeat_emit_encodeBetterBlockAsm64K one_byte_repeat_emit_encodeBetterBlockAsm64K: SHLB $0x03, SI MOVB SI, (CX) ADDQ $0x01, CX LEAQ (CX)(DI*1), SI // genMemMoveShort // margin: 8, min move: 1 CMPQ DI, $0x08 JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm64K_memmove_move_8 CMPQ DI, $0x10 JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm64K_memmove_move_8through16 CMPQ DI, $0x20 JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm64K_memmove_move_17through32 JMP emit_lit_memmove_repeat_emit_encodeBetterBlockAsm64K_memmove_move_33through64 emit_lit_memmove_repeat_emit_encodeBetterBlockAsm64K_memmove_move_8: MOVQ (R8), R9 MOVQ R9, (CX) JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm64K emit_lit_memmove_repeat_emit_encodeBetterBlockAsm64K_memmove_move_8through16: MOVQ (R8), R9 MOVQ -8(R8)(DI*1), R8 MOVQ R9, (CX) MOVQ R8, -8(CX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm64K emit_lit_memmove_repeat_emit_encodeBetterBlockAsm64K_memmove_move_17through32: MOVOU (R8), X0 MOVOU -16(R8)(DI*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm64K emit_lit_memmove_repeat_emit_encodeBetterBlockAsm64K_memmove_move_33through64: MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(DI*1), X2 MOVOU -16(R8)(DI*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(DI*1) MOVOU X3, -16(CX)(DI*1) memmove_end_copy_repeat_emit_encodeBetterBlockAsm64K: MOVQ SI, CX JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm64K memmove_midrepeat_emit_encodeBetterBlockAsm64K: LEAQ (CX)(DI*1), SI // genMemMoveShort // margin: 8, min 
move: 30 CMPQ DI, $0x20 JBE emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm64K_memmove_move_17through32 JMP emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm64K_memmove_move_33through64 emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm64K_memmove_move_17through32: MOVOU (R8), X0 MOVOU -16(R8)(DI*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(DI*1) JMP memmove_mid_end_copy_repeat_emit_encodeBetterBlockAsm64K emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm64K_memmove_move_33through64: MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(DI*1), X2 MOVOU -16(R8)(DI*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(DI*1) MOVOU X3, -16(CX)(DI*1) memmove_mid_end_copy_repeat_emit_encodeBetterBlockAsm64K: MOVQ SI, CX JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm64K memmove_long_repeat_emit_encodeBetterBlockAsm64K: LEAQ (CX)(DI*1), SI // genMemMoveLong MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(DI*1), X2 MOVOU -16(R8)(DI*1), X3 MOVQ DI, R10 SHRQ $0x05, R10 MOVQ CX, R9 ANDL $0x0000001f, R9 MOVQ $0x00000040, R11 SUBQ R9, R11 DECQ R10 JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm64Klarge_forward_sse_loop_32 LEAQ -32(R8)(R11*1), R9 LEAQ -32(CX)(R11*1), R12 emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm64Klarge_big_loop_back: MOVOU (R9), X4 MOVOU 16(R9), X5 MOVOA X4, (R12) MOVOA X5, 16(R12) ADDQ $0x20, R12 ADDQ $0x20, R9 ADDQ $0x20, R11 DECQ R10 JNA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm64Klarge_big_loop_back emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm64Klarge_forward_sse_loop_32: MOVOU -32(R8)(R11*1), X4 MOVOU -16(R8)(R11*1), X5 MOVOA X4, -32(CX)(R11*1) MOVOA X5, -16(CX)(R11*1) ADDQ $0x20, R11 CMPQ DI, R11 JAE emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm64Klarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(DI*1) MOVOU X3, -16(CX)(DI*1) MOVQ SI, CX emit_literal_done_repeat_emit_encodeBetterBlockAsm64K: ADDL $0x05, AX MOVL AX, SI SUBL 16(SP), SI MOVQ src_len+32(FP), DI SUBL 
AX, DI LEAQ (DX)(AX*1), R8 LEAQ (DX)(SI*1), SI // matchLen XORL R10, R10 JMP matchlen_loop_16_entry_repeat_extend_encodeBetterBlockAsm64K matchlen_loopback_16_repeat_extend_encodeBetterBlockAsm64K: MOVQ (R8)(R10*1), R9 MOVQ 8(R8)(R10*1), R11 XORQ (SI)(R10*1), R9 JNZ matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm64K XORQ 8(SI)(R10*1), R11 JNZ matchlen_bsf_16repeat_extend_encodeBetterBlockAsm64K LEAL -16(DI), DI LEAL 16(R10), R10 matchlen_loop_16_entry_repeat_extend_encodeBetterBlockAsm64K: CMPL DI, $0x10 JAE matchlen_loopback_16_repeat_extend_encodeBetterBlockAsm64K JMP matchlen_match8_repeat_extend_encodeBetterBlockAsm64K matchlen_bsf_16repeat_extend_encodeBetterBlockAsm64K: #ifdef GOAMD64_v3 TZCNTQ R11, R11 #else BSFQ R11, R11 #endif SARQ $0x03, R11 LEAL 8(R10)(R11*1), R10 JMP repeat_extend_forward_end_encodeBetterBlockAsm64K matchlen_match8_repeat_extend_encodeBetterBlockAsm64K: CMPL DI, $0x08 JB matchlen_match4_repeat_extend_encodeBetterBlockAsm64K MOVQ (R8)(R10*1), R9 XORQ (SI)(R10*1), R9 JNZ matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm64K LEAL -8(DI), DI LEAL 8(R10), R10 JMP matchlen_match4_repeat_extend_encodeBetterBlockAsm64K matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm64K: #ifdef GOAMD64_v3 TZCNTQ R9, R9 #else BSFQ R9, R9 #endif SARQ $0x03, R9 LEAL (R10)(R9*1), R10 JMP repeat_extend_forward_end_encodeBetterBlockAsm64K matchlen_match4_repeat_extend_encodeBetterBlockAsm64K: CMPL DI, $0x04 JB matchlen_match2_repeat_extend_encodeBetterBlockAsm64K MOVL (R8)(R10*1), R9 CMPL (SI)(R10*1), R9 JNE matchlen_match2_repeat_extend_encodeBetterBlockAsm64K LEAL -4(DI), DI LEAL 4(R10), R10 matchlen_match2_repeat_extend_encodeBetterBlockAsm64K: CMPL DI, $0x01 JE matchlen_match1_repeat_extend_encodeBetterBlockAsm64K JB repeat_extend_forward_end_encodeBetterBlockAsm64K MOVW (R8)(R10*1), R9 CMPW (SI)(R10*1), R9 JNE matchlen_match1_repeat_extend_encodeBetterBlockAsm64K LEAL 2(R10), R10 SUBL $0x02, DI JZ repeat_extend_forward_end_encodeBetterBlockAsm64K 
matchlen_match1_repeat_extend_encodeBetterBlockAsm64K: MOVB (R8)(R10*1), R9 CMPB (SI)(R10*1), R9 JNE repeat_extend_forward_end_encodeBetterBlockAsm64K LEAL 1(R10), R10 repeat_extend_forward_end_encodeBetterBlockAsm64K: ADDL R10, AX MOVL AX, SI SUBL BX, SI MOVL 16(SP), BX // emitRepeat LEAL -1(SI), BX CMPL SI, $0x1d JBE repeat_one_match_repeat_encodeBetterBlockAsm64K LEAL -30(SI), BX CMPL SI, $0x0000011e JB repeat_two_match_repeat_encodeBetterBlockAsm64K CMPL SI, $0x0001001e JB repeat_three_match_repeat_encodeBetterBlockAsm64K MOVB $0xfc, (CX) MOVL BX, 1(CX) ADDQ $0x04, CX JMP repeat_end_emit_encodeBetterBlockAsm64K repeat_three_match_repeat_encodeBetterBlockAsm64K: MOVB $0xf4, (CX) MOVW BX, 1(CX) ADDQ $0x03, CX JMP repeat_end_emit_encodeBetterBlockAsm64K repeat_two_match_repeat_encodeBetterBlockAsm64K: MOVB $0xec, (CX) MOVB BL, 1(CX) ADDQ $0x02, CX JMP repeat_end_emit_encodeBetterBlockAsm64K repeat_one_match_repeat_encodeBetterBlockAsm64K: XORL BX, BX LEAL -4(BX)(SI*8), BX MOVB BL, (CX) ADDQ $0x01, CX repeat_end_emit_encodeBetterBlockAsm64K: MOVL AX, 12(SP) JMP search_loop_encodeBetterBlockAsm64K no_repeat_found_encodeBetterBlockAsm64K: CMPL R10, DI JEQ candidate_match_encodeBetterBlockAsm64K CMPL R11, DI JEQ candidateS_match_encodeBetterBlockAsm64K MOVL 20(SP), AX JMP search_loop_encodeBetterBlockAsm64K candidateS_match_encodeBetterBlockAsm64K: SHRQ $0x08, DI MOVQ DI, R10 SHLQ $0x10, R10 IMULQ R9, R10 SHRQ $0x31, R10 MOVWLZX (BX)(R10*2), SI INCL AX MOVW AX, (BX)(R10*2) CMPL (DX)(SI*1), DI JEQ candidate_match_encodeBetterBlockAsm64K DECL AX MOVL R8, SI candidate_match_encodeBetterBlockAsm64K: MOVL 12(SP), BX TESTL SI, SI JZ match_extend_back_end_encodeBetterBlockAsm64K match_extend_back_loop_encodeBetterBlockAsm64K: CMPL AX, BX JBE match_extend_back_end_encodeBetterBlockAsm64K MOVB -1(DX)(SI*1), DI MOVB -1(DX)(AX*1), R8 CMPB DI, R8 JNE match_extend_back_end_encodeBetterBlockAsm64K LEAL -1(AX), AX DECL SI JZ match_extend_back_end_encodeBetterBlockAsm64K JMP 
match_extend_back_loop_encodeBetterBlockAsm64K match_extend_back_end_encodeBetterBlockAsm64K: MOVL AX, BX SUBL 12(SP), BX LEAQ 4(CX)(BX*1), BX CMPQ BX, (SP) JB match_dst_size_check_encodeBetterBlockAsm64K MOVQ $0x00000000, ret+56(FP) RET match_dst_size_check_encodeBetterBlockAsm64K: MOVL AX, BX ADDL $0x04, AX ADDL $0x04, SI MOVQ src_len+32(FP), DI SUBL AX, DI LEAQ (DX)(AX*1), R8 LEAQ (DX)(SI*1), R9 // matchLen XORL R11, R11 JMP matchlen_loop_16_entry_match_nolit_encodeBetterBlockAsm64K matchlen_loopback_16_match_nolit_encodeBetterBlockAsm64K: MOVQ (R8)(R11*1), R10 MOVQ 8(R8)(R11*1), R12 XORQ (R9)(R11*1), R10 JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm64K XORQ 8(R9)(R11*1), R12 JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm64K LEAL -16(DI), DI LEAL 16(R11), R11 matchlen_loop_16_entry_match_nolit_encodeBetterBlockAsm64K: CMPL DI, $0x10 JAE matchlen_loopback_16_match_nolit_encodeBetterBlockAsm64K JMP matchlen_match8_match_nolit_encodeBetterBlockAsm64K matchlen_bsf_16match_nolit_encodeBetterBlockAsm64K: #ifdef GOAMD64_v3 TZCNTQ R12, R12 #else BSFQ R12, R12 #endif SARQ $0x03, R12 LEAL 8(R11)(R12*1), R11 JMP match_nolit_end_encodeBetterBlockAsm64K matchlen_match8_match_nolit_encodeBetterBlockAsm64K: CMPL DI, $0x08 JB matchlen_match4_match_nolit_encodeBetterBlockAsm64K MOVQ (R8)(R11*1), R10 XORQ (R9)(R11*1), R10 JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm64K LEAL -8(DI), DI LEAL 8(R11), R11 JMP matchlen_match4_match_nolit_encodeBetterBlockAsm64K matchlen_bsf_8_match_nolit_encodeBetterBlockAsm64K: #ifdef GOAMD64_v3 TZCNTQ R10, R10 #else BSFQ R10, R10 #endif SARQ $0x03, R10 LEAL (R11)(R10*1), R11 JMP match_nolit_end_encodeBetterBlockAsm64K matchlen_match4_match_nolit_encodeBetterBlockAsm64K: CMPL DI, $0x04 JB matchlen_match2_match_nolit_encodeBetterBlockAsm64K MOVL (R8)(R11*1), R10 CMPL (R9)(R11*1), R10 JNE matchlen_match2_match_nolit_encodeBetterBlockAsm64K LEAL -4(DI), DI LEAL 4(R11), R11 matchlen_match2_match_nolit_encodeBetterBlockAsm64K: CMPL DI, 
$0x01 JE matchlen_match1_match_nolit_encodeBetterBlockAsm64K JB match_nolit_end_encodeBetterBlockAsm64K MOVW (R8)(R11*1), R10 CMPW (R9)(R11*1), R10 JNE matchlen_match1_match_nolit_encodeBetterBlockAsm64K LEAL 2(R11), R11 SUBL $0x02, DI JZ match_nolit_end_encodeBetterBlockAsm64K matchlen_match1_match_nolit_encodeBetterBlockAsm64K: MOVB (R8)(R11*1), R10 CMPB (R9)(R11*1), R10 JNE match_nolit_end_encodeBetterBlockAsm64K LEAL 1(R11), R11 match_nolit_end_encodeBetterBlockAsm64K: MOVL AX, DI SUBL SI, DI MOVL DI, 16(SP) // Check if we can combine lit+copy MOVLQZX 12(SP), R8 MOVL BX, SI SUBL R8, SI JZ match_emit_nolits_encodeBetterBlockAsm64K CMPL DI, $0x00000040 JL match_emit_lits_encodeBetterBlockAsm64K CMPL SI, $0x04 JA match_emit_lits_encodeBetterBlockAsm64K MOVL (DX)(R8*1), R8 ADDL R11, AX ADDL $0x04, R11 MOVL AX, 12(SP) // emitCopy2WithLits XORQ R9, R9 SUBL $0x40, DI LEAL -11(R11), R10 LEAL -4(R11), R11 MOVW DI, 1(CX) CMPL R11, $0x07 CMOVLGE R10, R9 MOVQ $0x00000007, DI CMOVLLT R11, DI LEAL -1(SI)(DI*4), DI MOVL $0x00000003, R10 LEAL (R10)(DI*8), DI MOVB DI, (CX) ADDQ $0x03, CX MOVL R8, (CX) ADDQ SI, CX TESTL R9, R9 JZ match_nolit_emitcopy_end_encodeBetterBlockAsm64K // emitRepeat LEAL -1(R9), SI CMPL R9, $0x1d JBE repeat_one_match_emit_repeat_copy2_encodeBetterBlockAsm64K LEAL -30(R9), SI CMPL R9, $0x0000011e JB repeat_two_match_emit_repeat_copy2_encodeBetterBlockAsm64K CMPL R9, $0x0001001e JB repeat_three_match_emit_repeat_copy2_encodeBetterBlockAsm64K MOVB $0xfc, (CX) MOVL SI, 1(CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm64K repeat_three_match_emit_repeat_copy2_encodeBetterBlockAsm64K: MOVB $0xf4, (CX) MOVW SI, 1(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm64K repeat_two_match_emit_repeat_copy2_encodeBetterBlockAsm64K: MOVB $0xec, (CX) MOVB SI, 1(CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm64K repeat_one_match_emit_repeat_copy2_encodeBetterBlockAsm64K: XORL SI, SI LEAL -4(SI)(R9*8), SI MOVB 
SI, (CX) ADDQ $0x01, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm64K match_emit_lits_encodeBetterBlockAsm64K: LEAQ (DX)(R8*1), R8 // emitLiteral LEAL -1(SI), R9 CMPL R9, $0x1d JB one_byte_match_emit_encodeBetterBlockAsm64K SUBL $0x1d, R9 CMPL R9, $0x00000100 JB two_bytes_match_emit_encodeBetterBlockAsm64K JB three_bytes_match_emit_encodeBetterBlockAsm64K MOVL R9, R10 SHRL $0x10, R10 MOVB $0xf8, (CX) MOVW R9, 1(CX) MOVB R10, 3(CX) ADDQ $0x04, CX ADDL $0x1d, R9 JMP memmove_long_match_emit_encodeBetterBlockAsm64K three_bytes_match_emit_encodeBetterBlockAsm64K: MOVB $0xf0, (CX) MOVW R9, 1(CX) ADDQ $0x03, CX ADDL $0x1d, R9 JMP memmove_long_match_emit_encodeBetterBlockAsm64K two_bytes_match_emit_encodeBetterBlockAsm64K: MOVB $0xe8, (CX) MOVB R9, 1(CX) ADDL $0x1d, R9 ADDQ $0x02, CX CMPL R9, $0x40 JB memmove_midmatch_emit_encodeBetterBlockAsm64K JMP memmove_long_match_emit_encodeBetterBlockAsm64K one_byte_match_emit_encodeBetterBlockAsm64K: SHLB $0x03, R9 MOVB R9, (CX) ADDQ $0x01, CX LEAQ (CX)(SI*1), R9 // genMemMoveShort // margin: 8, min move: 1 CMPQ SI, $0x08 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm64K_memmove_move_8 CMPQ SI, $0x10 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm64K_memmove_move_8through16 CMPQ SI, $0x20 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm64K_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm64K_memmove_move_33through64 emit_lit_memmove_match_emit_encodeBetterBlockAsm64K_memmove_move_8: MOVQ (R8), R10 MOVQ R10, (CX) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm64K emit_lit_memmove_match_emit_encodeBetterBlockAsm64K_memmove_move_8through16: MOVQ (R8), R10 MOVQ -8(R8)(SI*1), R8 MOVQ R10, (CX) MOVQ R8, -8(CX)(SI*1) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm64K emit_lit_memmove_match_emit_encodeBetterBlockAsm64K_memmove_move_17through32: MOVOU (R8), X0 MOVOU -16(R8)(SI*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(SI*1) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm64K 
emit_lit_memmove_match_emit_encodeBetterBlockAsm64K_memmove_move_33through64: MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(SI*1), X2 MOVOU -16(R8)(SI*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(SI*1) MOVOU X3, -16(CX)(SI*1) memmove_end_copy_match_emit_encodeBetterBlockAsm64K: MOVQ R9, CX JMP match_emit_nolits_encodeBetterBlockAsm64K memmove_midmatch_emit_encodeBetterBlockAsm64K: LEAQ (CX)(SI*1), R9 // genMemMoveShort // margin: 8, min move: 30 CMPQ SI, $0x20 JBE emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm64K_memmove_move_17through32 JMP emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm64K_memmove_move_33through64 emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm64K_memmove_move_17through32: MOVOU (R8), X0 MOVOU -16(R8)(SI*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(SI*1) JMP memmove_mid_end_copy_match_emit_encodeBetterBlockAsm64K emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm64K_memmove_move_33through64: MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(SI*1), X2 MOVOU -16(R8)(SI*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(SI*1) MOVOU X3, -16(CX)(SI*1) memmove_mid_end_copy_match_emit_encodeBetterBlockAsm64K: MOVQ R9, CX JMP match_emit_nolits_encodeBetterBlockAsm64K memmove_long_match_emit_encodeBetterBlockAsm64K: LEAQ (CX)(SI*1), R9 // genMemMoveLong MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(SI*1), X2 MOVOU -16(R8)(SI*1), X3 MOVQ SI, R12 SHRQ $0x05, R12 MOVQ CX, R10 ANDL $0x0000001f, R10 MOVQ $0x00000040, R13 SUBQ R10, R13 DECQ R12 JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm64Klarge_forward_sse_loop_32 LEAQ -32(R8)(R13*1), R10 LEAQ -32(CX)(R13*1), R14 emit_lit_memmove_long_match_emit_encodeBetterBlockAsm64Klarge_big_loop_back: MOVOU (R10), X4 MOVOU 16(R10), X5 MOVOA X4, (R14) MOVOA X5, 16(R14) ADDQ $0x20, R14 ADDQ $0x20, R10 ADDQ $0x20, R13 DECQ R12 JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm64Klarge_big_loop_back emit_lit_memmove_long_match_emit_encodeBetterBlockAsm64Klarge_forward_sse_loop_32: 
MOVOU -32(R8)(R13*1), X4 MOVOU -16(R8)(R13*1), X5 MOVOA X4, -32(CX)(R13*1) MOVOA X5, -16(CX)(R13*1) ADDQ $0x20, R13 CMPQ SI, R13 JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm64Klarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(SI*1) MOVOU X3, -16(CX)(SI*1) MOVQ R9, CX match_emit_nolits_encodeBetterBlockAsm64K: ADDL R11, AX ADDL $0x04, R11 MOVL AX, 12(SP) // emitCopy CMPL DI, $0x00000400 JA two_byte_match_nolit_encodeBetterBlockAsm64K CMPL R11, $0x00000013 JAE emit_one_longer_match_nolit_encodeBetterBlockAsm64K LEAL -1(DI), SI SHLL $0x06, SI LEAL -15(SI)(R11*4), SI MOVW SI, (CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm64K emit_one_longer_match_nolit_encodeBetterBlockAsm64K: CMPL R11, $0x00000112 JAE emit_copy1_repeat_match_nolit_encodeBetterBlockAsm64K LEAL -1(DI), SI SHLL $0x06, SI LEAL 61(SI), SI MOVW SI, (CX) LEAL -18(R11), SI MOVB SI, 2(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm64K emit_copy1_repeat_match_nolit_encodeBetterBlockAsm64K: LEAL -1(DI), SI SHLL $0x06, SI LEAL 57(SI), SI MOVW SI, (CX) ADDQ $0x02, CX SUBL $0x12, R11 // emitRepeat LEAL -1(R11), SI CMPL R11, $0x1d JBE repeat_one_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm64K LEAL -30(R11), SI CMPL R11, $0x0000011e JB repeat_two_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm64K CMPL R11, $0x0001001e JB repeat_three_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm64K MOVB $0xfc, (CX) MOVL SI, 1(CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm64K repeat_three_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm64K: MOVB $0xf4, (CX) MOVW SI, 1(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm64K repeat_two_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm64K: MOVB $0xec, (CX) MOVB SI, 1(CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm64K repeat_one_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm64K: XORL SI, SI LEAL -4(SI)(R11*8), 
SI MOVB SI, (CX) ADDQ $0x01, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm64K two_byte_match_nolit_encodeBetterBlockAsm64K: // emitCopy2 LEAL -64(DI), DI LEAL -4(R11), R11 MOVW DI, 1(CX) CMPL R11, $0x3c JBE emit_copy2_0_match_nolit_encodeBetterBlockAsm64K_emit2 LEAL -60(R11), SI CMPL R11, $0x0000013c JB emit_copy2_1_match_nolit_encodeBetterBlockAsm64K_emit2 CMPL R11, $0x0001003c JB emit_copy2_2_match_nolit_encodeBetterBlockAsm64K_emit2 MOVB $0xfe, (CX) MOVL SI, 3(CX) ADDQ $0x06, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm64K emit_copy2_2_match_nolit_encodeBetterBlockAsm64K_emit2: MOVB $0xfa, (CX) MOVW SI, 3(CX) ADDQ $0x05, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm64K emit_copy2_1_match_nolit_encodeBetterBlockAsm64K_emit2: MOVB $0xf6, (CX) MOVB SI, 3(CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm64K emit_copy2_0_match_nolit_encodeBetterBlockAsm64K_emit2: MOVL $0x00000002, SI LEAL (SI)(R11*4), SI MOVB SI, (CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm64K // emitLiteralsDstP MOVL 12(SP), SI CMPL SI, BX JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm64K MOVL BX, DI MOVL BX, 12(SP) LEAQ (DX)(SI*1), R8 SUBL SI, DI // emitLiteral LEAL -1(DI), SI CMPL SI, $0x1d JB one_byte_match_emit_repeat_encodeBetterBlockAsm64K SUBL $0x1d, SI CMPL SI, $0x00000100 JB two_bytes_match_emit_repeat_encodeBetterBlockAsm64K JB three_bytes_match_emit_repeat_encodeBetterBlockAsm64K MOVL SI, R9 SHRL $0x10, R9 MOVB $0xf8, (CX) MOVW SI, 1(CX) MOVB R9, 3(CX) ADDQ $0x04, CX ADDL $0x1d, SI JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm64K three_bytes_match_emit_repeat_encodeBetterBlockAsm64K: MOVB $0xf0, (CX) MOVW SI, 1(CX) ADDQ $0x03, CX ADDL $0x1d, SI JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm64K two_bytes_match_emit_repeat_encodeBetterBlockAsm64K: MOVB $0xe8, (CX) MOVB SI, 1(CX) ADDL $0x1d, SI ADDQ $0x02, CX CMPL SI, $0x40 JB memmove_midmatch_emit_repeat_encodeBetterBlockAsm64K JMP 
memmove_long_match_emit_repeat_encodeBetterBlockAsm64K one_byte_match_emit_repeat_encodeBetterBlockAsm64K: SHLB $0x03, SI MOVB SI, (CX) ADDQ $0x01, CX LEAQ (CX)(DI*1), SI // genMemMoveShort // margin: 8, min move: 1 CMPQ DI, $0x08 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm64K_memmove_move_8 CMPQ DI, $0x10 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm64K_memmove_move_8through16 CMPQ DI, $0x20 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm64K_memmove_move_17through32 JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm64K_memmove_move_33through64 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm64K_memmove_move_8: MOVQ (R8), R9 MOVQ R9, (CX) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm64K emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm64K_memmove_move_8through16: MOVQ (R8), R9 MOVQ -8(R8)(DI*1), R8 MOVQ R9, (CX) MOVQ R8, -8(CX)(DI*1) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm64K emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm64K_memmove_move_17through32: MOVOU (R8), X0 MOVOU -16(R8)(DI*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(DI*1) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm64K emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm64K_memmove_move_33through64: MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(DI*1), X2 MOVOU -16(R8)(DI*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(DI*1) MOVOU X3, -16(CX)(DI*1) memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm64K: MOVQ SI, CX JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm64K memmove_midmatch_emit_repeat_encodeBetterBlockAsm64K: LEAQ (CX)(DI*1), SI // genMemMoveShort // margin: 8, min move: 30 CMPQ DI, $0x20 JBE emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm64K_memmove_move_17through32 JMP emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm64K_memmove_move_33through64 
emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm64K_memmove_move_17through32: MOVOU (R8), X0 MOVOU -16(R8)(DI*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(DI*1) JMP memmove_mid_end_copy_match_emit_repeat_encodeBetterBlockAsm64K emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm64K_memmove_move_33through64: MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(DI*1), X2 MOVOU -16(R8)(DI*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(DI*1) MOVOU X3, -16(CX)(DI*1) memmove_mid_end_copy_match_emit_repeat_encodeBetterBlockAsm64K: MOVQ SI, CX JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm64K memmove_long_match_emit_repeat_encodeBetterBlockAsm64K: LEAQ (CX)(DI*1), SI // genMemMoveLong MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(DI*1), X2 MOVOU -16(R8)(DI*1), X3 MOVQ DI, R10 SHRQ $0x05, R10 MOVQ CX, R9 ANDL $0x0000001f, R9 MOVQ $0x00000040, R12 SUBQ R9, R12 DECQ R10 JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm64Klarge_forward_sse_loop_32 LEAQ -32(R8)(R12*1), R9 LEAQ -32(CX)(R12*1), R13 emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm64Klarge_big_loop_back: MOVOU (R9), X4 MOVOU 16(R9), X5 MOVOA X4, (R13) MOVOA X5, 16(R13) ADDQ $0x20, R13 ADDQ $0x20, R9 ADDQ $0x20, R12 DECQ R10 JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm64Klarge_big_loop_back emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm64Klarge_forward_sse_loop_32: MOVOU -32(R8)(R12*1), X4 MOVOU -16(R8)(R12*1), X5 MOVOA X4, -32(CX)(R12*1) MOVOA X5, -16(CX)(R12*1) ADDQ $0x20, R12 CMPQ DI, R12 JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm64Klarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(DI*1) MOVOU X3, -16(CX)(DI*1) MOVQ SI, CX emit_literal_done_match_emit_repeat_encodeBetterBlockAsm64K: ADDL R11, AX ADDL $0x04, R11 MOVL AX, 12(SP) // emitRepeat LEAL -1(R11), SI CMPL R11, $0x1d JBE repeat_one_match_nolit_repeat_encodeBetterBlockAsm64K LEAL -30(R11), SI CMPL R11, $0x0000011e JB 
repeat_two_match_nolit_repeat_encodeBetterBlockAsm64K CMPL R11, $0x0001001e JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm64K MOVB $0xfc, (CX) MOVL SI, 1(CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm64K repeat_three_match_nolit_repeat_encodeBetterBlockAsm64K: MOVB $0xf4, (CX) MOVW SI, 1(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm64K repeat_two_match_nolit_repeat_encodeBetterBlockAsm64K: MOVB $0xec, (CX) MOVB SI, 1(CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm64K repeat_one_match_nolit_repeat_encodeBetterBlockAsm64K: XORL SI, SI LEAL -4(SI)(R11*8), SI MOVB SI, (CX) ADDQ $0x01, CX match_nolit_emitcopy_end_encodeBetterBlockAsm64K: CMPL AX, 8(SP) JAE emit_remainder_encodeBetterBlockAsm64K CMPQ CX, (SP) JB match_nolit_dst_ok_encodeBetterBlockAsm64K MOVQ $0x00000000, ret+56(FP) RET match_nolit_dst_ok_encodeBetterBlockAsm64K: MOVQ tmp+48(FP), SI MOVQ $0x0000cf1bbcdcbf9b, DI MOVQ $0x9e3779b1, R8 LEAQ 1(BX), BX LEAQ -2(AX), R9 MOVQ (DX)(BX*1), R10 MOVQ 1(DX)(BX*1), R11 MOVQ (DX)(R9*1), R12 MOVQ 1(DX)(R9*1), R13 SHLQ $0x10, R10 IMULQ DI, R10 SHRQ $0x31, R10 SHLQ $0x20, R11 IMULQ R8, R11 SHRQ $0x34, R11 SHLQ $0x10, R12 IMULQ DI, R12 SHRQ $0x31, R12 SHLQ $0x20, R13 IMULQ R8, R13 SHRQ $0x34, R13 LEAQ 1(BX), R8 LEAQ 1(R9), R14 MOVW BX, (SI)(R10*2) MOVW R9, (SI)(R12*2) LEAQ 1(R9)(BX*1), R10 SHRQ $0x01, R10 ADDQ $0x01, BX SUBQ $0x01, R9 MOVW R8, 65536(SI)(R11*2) MOVW R14, 65536(SI)(R13*2) index_loop_encodeBetterBlockAsm64K: CMPQ R10, R9 JAE search_loop_encodeBetterBlockAsm64K MOVQ (DX)(BX*1), R8 MOVQ (DX)(R10*1), R11 SHLQ $0x10, R8 IMULQ DI, R8 SHRQ $0x31, R8 SHLQ $0x10, R11 IMULQ DI, R11 SHRQ $0x31, R11 MOVW BX, (SI)(R8*2) MOVW R9, (SI)(R11*2) ADDQ $0x02, BX ADDQ $0x02, R10 JMP index_loop_encodeBetterBlockAsm64K emit_remainder_encodeBetterBlockAsm64K: MOVQ src_len+32(FP), AX SUBL 12(SP), AX LEAQ 4(CX)(AX*1), AX CMPQ AX, (SP) JB emit_remainder_ok_encodeBetterBlockAsm64K MOVQ $0x00000000, ret+56(FP) RET 
emit_remainder_ok_encodeBetterBlockAsm64K: MOVQ src_len+32(FP), AX // emitLiteralsDstP MOVL 12(SP), BX CMPL BX, AX JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm64K MOVL AX, SI MOVL AX, 12(SP) LEAQ (DX)(BX*1), AX SUBL BX, SI // emitLiteral LEAL -1(SI), DX CMPL DX, $0x1d JB one_byte_emit_remainder_encodeBetterBlockAsm64K SUBL $0x1d, DX CMPL DX, $0x00000100 JB two_bytes_emit_remainder_encodeBetterBlockAsm64K JB three_bytes_emit_remainder_encodeBetterBlockAsm64K MOVL DX, BX SHRL $0x10, BX MOVB $0xf8, (CX) MOVW DX, 1(CX) MOVB BL, 3(CX) ADDQ $0x04, CX ADDL $0x1d, DX JMP memmove_long_emit_remainder_encodeBetterBlockAsm64K three_bytes_emit_remainder_encodeBetterBlockAsm64K: MOVB $0xf0, (CX) MOVW DX, 1(CX) ADDQ $0x03, CX ADDL $0x1d, DX JMP memmove_long_emit_remainder_encodeBetterBlockAsm64K two_bytes_emit_remainder_encodeBetterBlockAsm64K: MOVB $0xe8, (CX) MOVB DL, 1(CX) ADDL $0x1d, DX ADDQ $0x02, CX CMPL DX, $0x40 JB memmove_midemit_remainder_encodeBetterBlockAsm64K JMP memmove_long_emit_remainder_encodeBetterBlockAsm64K one_byte_emit_remainder_encodeBetterBlockAsm64K: SHLB $0x03, DL MOVB DL, (CX) ADDQ $0x01, CX LEAQ (CX)(SI*1), DX MOVL SI, BX // genMemMoveShort // margin: -1, min move: 1 CMPQ BX, $0x03 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm64K_memmove_move_1or2 JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm64K_memmove_move_3 CMPQ BX, $0x08 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm64K_memmove_move_4through8 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm64K_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm64K_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm64K_memmove_move_33through64 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm64K_memmove_move_1or2: MOVB (AX), SI MOVB -1(AX)(BX*1), AL MOVB SI, (CX) MOVB AL, -1(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm64K 
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm64K_memmove_move_3: MOVW (AX), SI MOVB 2(AX), AL MOVW SI, (CX) MOVB AL, 2(CX) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm64K emit_lit_memmove_emit_remainder_encodeBetterBlockAsm64K_memmove_move_4through8: MOVL (AX), SI MOVL -4(AX)(BX*1), AX MOVL SI, (CX) MOVL AX, -4(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm64K emit_lit_memmove_emit_remainder_encodeBetterBlockAsm64K_memmove_move_8through16: MOVQ (AX), SI MOVQ -8(AX)(BX*1), AX MOVQ SI, (CX) MOVQ AX, -8(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm64K emit_lit_memmove_emit_remainder_encodeBetterBlockAsm64K_memmove_move_17through32: MOVOU (AX), X0 MOVOU -16(AX)(BX*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm64K emit_lit_memmove_emit_remainder_encodeBetterBlockAsm64K_memmove_move_33through64: MOVOU (AX), X0 MOVOU 16(AX), X1 MOVOU -32(AX)(BX*1), X2 MOVOU -16(AX)(BX*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(BX*1) MOVOU X3, -16(CX)(BX*1) memmove_end_copy_emit_remainder_encodeBetterBlockAsm64K: MOVQ DX, CX JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm64K memmove_midemit_remainder_encodeBetterBlockAsm64K: LEAQ (CX)(SI*1), DX MOVL SI, BX // genMemMoveShort // margin: -2, min move: 30 CMPQ BX, $0x20 JBE emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm64K_memmove_move_17through32 JMP emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm64K_memmove_move_33through64 emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm64K_memmove_move_17through32: MOVOU (AX), X0 MOVOU -16(AX)(BX*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(BX*1) JMP memmove_mid_end_copy_emit_remainder_encodeBetterBlockAsm64K emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm64K_memmove_move_33through64: MOVOU (AX), X0 MOVOU 16(AX), X1 MOVOU -32(AX)(BX*1), X2 MOVOU -16(AX)(BX*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(BX*1) MOVOU X3, 
-16(CX)(BX*1) memmove_mid_end_copy_emit_remainder_encodeBetterBlockAsm64K: MOVQ DX, CX JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm64K memmove_long_emit_remainder_encodeBetterBlockAsm64K: LEAQ (CX)(SI*1), DX MOVL SI, BX // genMemMoveLong MOVOU (AX), X0 MOVOU 16(AX), X1 MOVOU -32(AX)(BX*1), X2 MOVOU -16(AX)(BX*1), X3 MOVQ BX, DI SHRQ $0x05, DI MOVQ CX, SI ANDL $0x0000001f, SI MOVQ $0x00000040, R8 SUBQ SI, R8 DECQ DI JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm64Klarge_forward_sse_loop_32 LEAQ -32(AX)(R8*1), SI LEAQ -32(CX)(R8*1), R9 emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm64Klarge_big_loop_back: MOVOU (SI), X4 MOVOU 16(SI), X5 MOVOA X4, (R9) MOVOA X5, 16(R9) ADDQ $0x20, R9 ADDQ $0x20, SI ADDQ $0x20, R8 DECQ DI JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm64Klarge_big_loop_back emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm64Klarge_forward_sse_loop_32: MOVOU -32(AX)(R8*1), X4 MOVOU -16(AX)(R8*1), X5 MOVOA X4, -32(CX)(R8*1) MOVOA X5, -16(CX)(R8*1) ADDQ $0x20, R8 CMPQ BX, R8 JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm64Klarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(BX*1) MOVOU X3, -16(CX)(BX*1) MOVQ DX, CX emit_literal_done_emit_remainder_encodeBetterBlockAsm64K: MOVQ dst_base+0(FP), AX SUBQ AX, CX MOVQ CX, ret+56(FP) RET // func encodeBetterBlockAsm16K(dst []byte, src []byte, tmp *[36864]byte) int // Requires: BMI, CMOV, SSE2 TEXT ·encodeBetterBlockAsm16K(SB), $24-64 MOVQ tmp+48(FP), AX MOVQ dst_base+0(FP), CX MOVQ $0x00000120, DX PXOR X0, X0 zero_loop_encodeBetterBlockAsm16K: MOVOU X0, (AX) MOVOU X0, 16(AX) MOVOU X0, 32(AX) MOVOU X0, 48(AX) MOVOU X0, 64(AX) MOVOU X0, 80(AX) MOVOU X0, 96(AX) MOVOU X0, 112(AX) ADDQ $0x80, AX DECQ DX JNZ zero_loop_encodeBetterBlockAsm16K MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), AX LEAQ -11(AX), DX LEAQ -8(AX), BX MOVL BX, 8(SP) SHRQ $0x05, AX SUBL AX, DX LEAQ (CX)(DX*1), DX MOVQ DX, (SP) MOVL $0x00000001, AX MOVL 
AX, 16(SP) MOVQ src_base+24(FP), DX search_loop_encodeBetterBlockAsm16K: MOVQ tmp+48(FP), BX MOVL AX, SI SUBL 12(SP), SI SHRL $0x06, SI LEAL 1(AX)(SI*1), SI CMPL SI, 8(SP) JAE emit_remainder_encodeBetterBlockAsm16K MOVQ (DX)(AX*1), DI MOVL SI, 20(SP) MOVQ $0x0000cf1bbcdcbf9b, R9 MOVQ $0x9e3779b1, SI MOVQ DI, R10 MOVQ DI, R11 SHLQ $0x10, R10 IMULQ R9, R10 SHRQ $0x32, R10 SHLQ $0x20, R11 IMULQ SI, R11 SHRQ $0x35, R11 MOVWLZX (BX)(R10*2), SI MOVWLZX 32768(BX)(R11*2), R8 MOVW AX, (BX)(R10*2) MOVW AX, 32768(BX)(R11*2) MOVQ (DX)(SI*1), R10 CMPQ R10, DI JEQ candidate_match_encodeBetterBlockAsm16K MOVQ (DX)(R8*1), R11 CMPQ R11, DI MOVL AX, R12 SUBL 16(SP), R12 MOVQ (DX)(R12*1), R12 MOVQ $0x000000ffffffff00, R13 XORQ DI, R12 TESTQ R13, R12 JNE no_repeat_found_encodeBetterBlockAsm16K LEAL 1(AX), BX MOVL 12(SP), SI MOVL BX, DI SUBL 16(SP), DI JZ repeat_extend_back_end_encodeBetterBlockAsm16K repeat_extend_back_loop_encodeBetterBlockAsm16K: CMPL BX, SI JBE repeat_extend_back_end_encodeBetterBlockAsm16K MOVB -1(DX)(DI*1), R8 MOVB -1(DX)(BX*1), R9 CMPB R8, R9 JNE repeat_extend_back_end_encodeBetterBlockAsm16K LEAL -1(BX), BX DECL DI JNZ repeat_extend_back_loop_encodeBetterBlockAsm16K repeat_extend_back_end_encodeBetterBlockAsm16K: MOVL BX, SI SUBL 12(SP), SI LEAQ 3(CX)(SI*1), SI CMPQ SI, (SP) JB repeat_dst_size_check_encodeBetterBlockAsm16K MOVQ $0x00000000, ret+56(FP) RET repeat_dst_size_check_encodeBetterBlockAsm16K: // emitLiteralsDstP MOVL 12(SP), SI CMPL SI, BX JEQ emit_literal_done_repeat_emit_encodeBetterBlockAsm16K MOVL BX, DI MOVL BX, 12(SP) LEAQ (DX)(SI*1), R8 SUBL SI, DI // emitLiteral LEAL -1(DI), SI CMPL SI, $0x1d JB one_byte_repeat_emit_encodeBetterBlockAsm16K SUBL $0x1d, SI CMPL SI, $0x00000100 JB two_bytes_repeat_emit_encodeBetterBlockAsm16K JB three_bytes_repeat_emit_encodeBetterBlockAsm16K three_bytes_repeat_emit_encodeBetterBlockAsm16K: MOVB $0xf0, (CX) MOVW SI, 1(CX) ADDQ $0x03, CX ADDL $0x1d, SI JMP memmove_long_repeat_emit_encodeBetterBlockAsm16K 
two_bytes_repeat_emit_encodeBetterBlockAsm16K: MOVB $0xe8, (CX) MOVB SI, 1(CX) ADDL $0x1d, SI ADDQ $0x02, CX CMPL SI, $0x40 JB memmove_midrepeat_emit_encodeBetterBlockAsm16K JMP memmove_long_repeat_emit_encodeBetterBlockAsm16K one_byte_repeat_emit_encodeBetterBlockAsm16K: SHLB $0x03, SI MOVB SI, (CX) ADDQ $0x01, CX LEAQ (CX)(DI*1), SI // genMemMoveShort // margin: 8, min move: 1 CMPQ DI, $0x08 JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm16K_memmove_move_8 CMPQ DI, $0x10 JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm16K_memmove_move_8through16 CMPQ DI, $0x20 JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm16K_memmove_move_17through32 JMP emit_lit_memmove_repeat_emit_encodeBetterBlockAsm16K_memmove_move_33through64 emit_lit_memmove_repeat_emit_encodeBetterBlockAsm16K_memmove_move_8: MOVQ (R8), R9 MOVQ R9, (CX) JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm16K emit_lit_memmove_repeat_emit_encodeBetterBlockAsm16K_memmove_move_8through16: MOVQ (R8), R9 MOVQ -8(R8)(DI*1), R8 MOVQ R9, (CX) MOVQ R8, -8(CX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm16K emit_lit_memmove_repeat_emit_encodeBetterBlockAsm16K_memmove_move_17through32: MOVOU (R8), X0 MOVOU -16(R8)(DI*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm16K emit_lit_memmove_repeat_emit_encodeBetterBlockAsm16K_memmove_move_33through64: MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(DI*1), X2 MOVOU -16(R8)(DI*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(DI*1) MOVOU X3, -16(CX)(DI*1) memmove_end_copy_repeat_emit_encodeBetterBlockAsm16K: MOVQ SI, CX JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm16K memmove_midrepeat_emit_encodeBetterBlockAsm16K: LEAQ (CX)(DI*1), SI // genMemMoveShort // margin: 8, min move: 30 CMPQ DI, $0x20 JBE emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm16K_memmove_move_17through32 JMP emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm16K_memmove_move_33through64 
emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm16K_memmove_move_17through32: MOVOU (R8), X0 MOVOU -16(R8)(DI*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(DI*1) JMP memmove_mid_end_copy_repeat_emit_encodeBetterBlockAsm16K emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm16K_memmove_move_33through64: MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(DI*1), X2 MOVOU -16(R8)(DI*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(DI*1) MOVOU X3, -16(CX)(DI*1) memmove_mid_end_copy_repeat_emit_encodeBetterBlockAsm16K: MOVQ SI, CX JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm16K memmove_long_repeat_emit_encodeBetterBlockAsm16K: LEAQ (CX)(DI*1), SI // genMemMoveLong MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(DI*1), X2 MOVOU -16(R8)(DI*1), X3 MOVQ DI, R10 SHRQ $0x05, R10 MOVQ CX, R9 ANDL $0x0000001f, R9 MOVQ $0x00000040, R11 SUBQ R9, R11 DECQ R10 JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm16Klarge_forward_sse_loop_32 LEAQ -32(R8)(R11*1), R9 LEAQ -32(CX)(R11*1), R12 emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm16Klarge_big_loop_back: MOVOU (R9), X4 MOVOU 16(R9), X5 MOVOA X4, (R12) MOVOA X5, 16(R12) ADDQ $0x20, R12 ADDQ $0x20, R9 ADDQ $0x20, R11 DECQ R10 JNA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm16Klarge_big_loop_back emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm16Klarge_forward_sse_loop_32: MOVOU -32(R8)(R11*1), X4 MOVOU -16(R8)(R11*1), X5 MOVOA X4, -32(CX)(R11*1) MOVOA X5, -16(CX)(R11*1) ADDQ $0x20, R11 CMPQ DI, R11 JAE emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm16Klarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(DI*1) MOVOU X3, -16(CX)(DI*1) MOVQ SI, CX emit_literal_done_repeat_emit_encodeBetterBlockAsm16K: ADDL $0x05, AX MOVL AX, SI SUBL 16(SP), SI MOVQ src_len+32(FP), DI SUBL AX, DI LEAQ (DX)(AX*1), R8 LEAQ (DX)(SI*1), SI // matchLen XORL R10, R10 JMP matchlen_loop_16_entry_repeat_extend_encodeBetterBlockAsm16K matchlen_loopback_16_repeat_extend_encodeBetterBlockAsm16K: 
MOVQ (R8)(R10*1), R9 MOVQ 8(R8)(R10*1), R11 XORQ (SI)(R10*1), R9 JNZ matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm16K XORQ 8(SI)(R10*1), R11 JNZ matchlen_bsf_16repeat_extend_encodeBetterBlockAsm16K LEAL -16(DI), DI LEAL 16(R10), R10 matchlen_loop_16_entry_repeat_extend_encodeBetterBlockAsm16K: CMPL DI, $0x10 JAE matchlen_loopback_16_repeat_extend_encodeBetterBlockAsm16K JMP matchlen_match8_repeat_extend_encodeBetterBlockAsm16K matchlen_bsf_16repeat_extend_encodeBetterBlockAsm16K: #ifdef GOAMD64_v3 TZCNTQ R11, R11 #else BSFQ R11, R11 #endif SARQ $0x03, R11 LEAL 8(R10)(R11*1), R10 JMP repeat_extend_forward_end_encodeBetterBlockAsm16K matchlen_match8_repeat_extend_encodeBetterBlockAsm16K: CMPL DI, $0x08 JB matchlen_match4_repeat_extend_encodeBetterBlockAsm16K MOVQ (R8)(R10*1), R9 XORQ (SI)(R10*1), R9 JNZ matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm16K LEAL -8(DI), DI LEAL 8(R10), R10 JMP matchlen_match4_repeat_extend_encodeBetterBlockAsm16K matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm16K: #ifdef GOAMD64_v3 TZCNTQ R9, R9 #else BSFQ R9, R9 #endif SARQ $0x03, R9 LEAL (R10)(R9*1), R10 JMP repeat_extend_forward_end_encodeBetterBlockAsm16K matchlen_match4_repeat_extend_encodeBetterBlockAsm16K: CMPL DI, $0x04 JB matchlen_match2_repeat_extend_encodeBetterBlockAsm16K MOVL (R8)(R10*1), R9 CMPL (SI)(R10*1), R9 JNE matchlen_match2_repeat_extend_encodeBetterBlockAsm16K LEAL -4(DI), DI LEAL 4(R10), R10 matchlen_match2_repeat_extend_encodeBetterBlockAsm16K: CMPL DI, $0x01 JE matchlen_match1_repeat_extend_encodeBetterBlockAsm16K JB repeat_extend_forward_end_encodeBetterBlockAsm16K MOVW (R8)(R10*1), R9 CMPW (SI)(R10*1), R9 JNE matchlen_match1_repeat_extend_encodeBetterBlockAsm16K LEAL 2(R10), R10 SUBL $0x02, DI JZ repeat_extend_forward_end_encodeBetterBlockAsm16K matchlen_match1_repeat_extend_encodeBetterBlockAsm16K: MOVB (R8)(R10*1), R9 CMPB (SI)(R10*1), R9 JNE repeat_extend_forward_end_encodeBetterBlockAsm16K LEAL 1(R10), R10 
repeat_extend_forward_end_encodeBetterBlockAsm16K: ADDL R10, AX MOVL AX, SI SUBL BX, SI MOVL 16(SP), BX // emitRepeat LEAL -1(SI), BX CMPL SI, $0x1d JBE repeat_one_match_repeat_encodeBetterBlockAsm16K LEAL -30(SI), BX CMPL SI, $0x0000011e JB repeat_two_match_repeat_encodeBetterBlockAsm16K CMPL SI, $0x0001001e JB repeat_three_match_repeat_encodeBetterBlockAsm16K MOVB $0xfc, (CX) MOVL BX, 1(CX) ADDQ $0x04, CX JMP repeat_end_emit_encodeBetterBlockAsm16K repeat_three_match_repeat_encodeBetterBlockAsm16K: MOVB $0xf4, (CX) MOVW BX, 1(CX) ADDQ $0x03, CX JMP repeat_end_emit_encodeBetterBlockAsm16K repeat_two_match_repeat_encodeBetterBlockAsm16K: MOVB $0xec, (CX) MOVB BL, 1(CX) ADDQ $0x02, CX JMP repeat_end_emit_encodeBetterBlockAsm16K repeat_one_match_repeat_encodeBetterBlockAsm16K: XORL BX, BX LEAL -4(BX)(SI*8), BX MOVB BL, (CX) ADDQ $0x01, CX repeat_end_emit_encodeBetterBlockAsm16K: MOVL AX, 12(SP) JMP search_loop_encodeBetterBlockAsm16K no_repeat_found_encodeBetterBlockAsm16K: CMPL R10, DI JEQ candidate_match_encodeBetterBlockAsm16K CMPL R11, DI JEQ candidateS_match_encodeBetterBlockAsm16K MOVL 20(SP), AX JMP search_loop_encodeBetterBlockAsm16K candidateS_match_encodeBetterBlockAsm16K: SHRQ $0x08, DI MOVQ DI, R10 SHLQ $0x10, R10 IMULQ R9, R10 SHRQ $0x32, R10 MOVWLZX (BX)(R10*2), SI INCL AX MOVW AX, (BX)(R10*2) CMPL (DX)(SI*1), DI JEQ candidate_match_encodeBetterBlockAsm16K DECL AX MOVL R8, SI candidate_match_encodeBetterBlockAsm16K: MOVL 12(SP), BX TESTL SI, SI JZ match_extend_back_end_encodeBetterBlockAsm16K match_extend_back_loop_encodeBetterBlockAsm16K: CMPL AX, BX JBE match_extend_back_end_encodeBetterBlockAsm16K MOVB -1(DX)(SI*1), DI MOVB -1(DX)(AX*1), R8 CMPB DI, R8 JNE match_extend_back_end_encodeBetterBlockAsm16K LEAL -1(AX), AX DECL SI JZ match_extend_back_end_encodeBetterBlockAsm16K JMP match_extend_back_loop_encodeBetterBlockAsm16K match_extend_back_end_encodeBetterBlockAsm16K: MOVL AX, BX SUBL 12(SP), BX LEAQ 3(CX)(BX*1), BX CMPQ BX, (SP) JB 
match_dst_size_check_encodeBetterBlockAsm16K MOVQ $0x00000000, ret+56(FP) RET match_dst_size_check_encodeBetterBlockAsm16K: MOVL AX, BX ADDL $0x04, AX ADDL $0x04, SI MOVQ src_len+32(FP), DI SUBL AX, DI LEAQ (DX)(AX*1), R8 LEAQ (DX)(SI*1), R9 // matchLen XORL R11, R11 JMP matchlen_loop_16_entry_match_nolit_encodeBetterBlockAsm16K matchlen_loopback_16_match_nolit_encodeBetterBlockAsm16K: MOVQ (R8)(R11*1), R10 MOVQ 8(R8)(R11*1), R12 XORQ (R9)(R11*1), R10 JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm16K XORQ 8(R9)(R11*1), R12 JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm16K LEAL -16(DI), DI LEAL 16(R11), R11 matchlen_loop_16_entry_match_nolit_encodeBetterBlockAsm16K: CMPL DI, $0x10 JAE matchlen_loopback_16_match_nolit_encodeBetterBlockAsm16K JMP matchlen_match8_match_nolit_encodeBetterBlockAsm16K matchlen_bsf_16match_nolit_encodeBetterBlockAsm16K: #ifdef GOAMD64_v3 TZCNTQ R12, R12 #else BSFQ R12, R12 #endif SARQ $0x03, R12 LEAL 8(R11)(R12*1), R11 JMP match_nolit_end_encodeBetterBlockAsm16K matchlen_match8_match_nolit_encodeBetterBlockAsm16K: CMPL DI, $0x08 JB matchlen_match4_match_nolit_encodeBetterBlockAsm16K MOVQ (R8)(R11*1), R10 XORQ (R9)(R11*1), R10 JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm16K LEAL -8(DI), DI LEAL 8(R11), R11 JMP matchlen_match4_match_nolit_encodeBetterBlockAsm16K matchlen_bsf_8_match_nolit_encodeBetterBlockAsm16K: #ifdef GOAMD64_v3 TZCNTQ R10, R10 #else BSFQ R10, R10 #endif SARQ $0x03, R10 LEAL (R11)(R10*1), R11 JMP match_nolit_end_encodeBetterBlockAsm16K matchlen_match4_match_nolit_encodeBetterBlockAsm16K: CMPL DI, $0x04 JB matchlen_match2_match_nolit_encodeBetterBlockAsm16K MOVL (R8)(R11*1), R10 CMPL (R9)(R11*1), R10 JNE matchlen_match2_match_nolit_encodeBetterBlockAsm16K LEAL -4(DI), DI LEAL 4(R11), R11 matchlen_match2_match_nolit_encodeBetterBlockAsm16K: CMPL DI, $0x01 JE matchlen_match1_match_nolit_encodeBetterBlockAsm16K JB match_nolit_end_encodeBetterBlockAsm16K MOVW (R8)(R11*1), R10 CMPW (R9)(R11*1), R10 JNE 
matchlen_match1_match_nolit_encodeBetterBlockAsm16K LEAL 2(R11), R11 SUBL $0x02, DI JZ match_nolit_end_encodeBetterBlockAsm16K matchlen_match1_match_nolit_encodeBetterBlockAsm16K: MOVB (R8)(R11*1), R10 CMPB (R9)(R11*1), R10 JNE match_nolit_end_encodeBetterBlockAsm16K LEAL 1(R11), R11 match_nolit_end_encodeBetterBlockAsm16K: MOVL AX, DI SUBL SI, DI MOVL DI, 16(SP) // Check if we can combine lit+copy MOVLQZX 12(SP), R8 MOVL BX, SI SUBL R8, SI JZ match_emit_nolits_encodeBetterBlockAsm16K CMPL DI, $0x00000040 JL match_emit_lits_encodeBetterBlockAsm16K CMPL SI, $0x04 JA match_emit_lits_encodeBetterBlockAsm16K MOVL (DX)(R8*1), R8 ADDL R11, AX ADDL $0x04, R11 MOVL AX, 12(SP) // emitCopy2WithLits XORQ R9, R9 SUBL $0x40, DI LEAL -11(R11), R10 LEAL -4(R11), R11 MOVW DI, 1(CX) CMPL R11, $0x07 CMOVLGE R10, R9 MOVQ $0x00000007, DI CMOVLLT R11, DI LEAL -1(SI)(DI*4), DI MOVL $0x00000003, R10 LEAL (R10)(DI*8), DI MOVB DI, (CX) ADDQ $0x03, CX MOVL R8, (CX) ADDQ SI, CX TESTL R9, R9 JZ match_nolit_emitcopy_end_encodeBetterBlockAsm16K // emitRepeat LEAL -1(R9), SI CMPL R9, $0x1d JBE repeat_one_match_emit_repeat_copy2_encodeBetterBlockAsm16K LEAL -30(R9), SI CMPL R9, $0x0000011e JB repeat_two_match_emit_repeat_copy2_encodeBetterBlockAsm16K CMPL R9, $0x0001001e JB repeat_three_match_emit_repeat_copy2_encodeBetterBlockAsm16K MOVB $0xfc, (CX) MOVL SI, 1(CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm16K repeat_three_match_emit_repeat_copy2_encodeBetterBlockAsm16K: MOVB $0xf4, (CX) MOVW SI, 1(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm16K repeat_two_match_emit_repeat_copy2_encodeBetterBlockAsm16K: MOVB $0xec, (CX) MOVB SI, 1(CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm16K repeat_one_match_emit_repeat_copy2_encodeBetterBlockAsm16K: XORL SI, SI LEAL -4(SI)(R9*8), SI MOVB SI, (CX) ADDQ $0x01, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm16K match_emit_lits_encodeBetterBlockAsm16K: LEAQ (DX)(R8*1), R8 // emitLiteral 
LEAL -1(SI), R9 CMPL R9, $0x1d JB one_byte_match_emit_encodeBetterBlockAsm16K SUBL $0x1d, R9 CMPL R9, $0x00000100 JB two_bytes_match_emit_encodeBetterBlockAsm16K JB three_bytes_match_emit_encodeBetterBlockAsm16K three_bytes_match_emit_encodeBetterBlockAsm16K: MOVB $0xf0, (CX) MOVW R9, 1(CX) ADDQ $0x03, CX ADDL $0x1d, R9 JMP memmove_long_match_emit_encodeBetterBlockAsm16K two_bytes_match_emit_encodeBetterBlockAsm16K: MOVB $0xe8, (CX) MOVB R9, 1(CX) ADDL $0x1d, R9 ADDQ $0x02, CX CMPL R9, $0x40 JB memmove_midmatch_emit_encodeBetterBlockAsm16K JMP memmove_long_match_emit_encodeBetterBlockAsm16K one_byte_match_emit_encodeBetterBlockAsm16K: SHLB $0x03, R9 MOVB R9, (CX) ADDQ $0x01, CX LEAQ (CX)(SI*1), R9 // genMemMoveShort // margin: 8, min move: 1 CMPQ SI, $0x08 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm16K_memmove_move_8 CMPQ SI, $0x10 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm16K_memmove_move_8through16 CMPQ SI, $0x20 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm16K_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm16K_memmove_move_33through64 emit_lit_memmove_match_emit_encodeBetterBlockAsm16K_memmove_move_8: MOVQ (R8), R10 MOVQ R10, (CX) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm16K emit_lit_memmove_match_emit_encodeBetterBlockAsm16K_memmove_move_8through16: MOVQ (R8), R10 MOVQ -8(R8)(SI*1), R8 MOVQ R10, (CX) MOVQ R8, -8(CX)(SI*1) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm16K emit_lit_memmove_match_emit_encodeBetterBlockAsm16K_memmove_move_17through32: MOVOU (R8), X0 MOVOU -16(R8)(SI*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(SI*1) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm16K emit_lit_memmove_match_emit_encodeBetterBlockAsm16K_memmove_move_33through64: MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(SI*1), X2 MOVOU -16(R8)(SI*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(SI*1) MOVOU X3, -16(CX)(SI*1) memmove_end_copy_match_emit_encodeBetterBlockAsm16K: MOVQ R9, CX JMP 
match_emit_nolits_encodeBetterBlockAsm16K memmove_midmatch_emit_encodeBetterBlockAsm16K: LEAQ (CX)(SI*1), R9 // genMemMoveShort // margin: 8, min move: 30 CMPQ SI, $0x20 JBE emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm16K_memmove_move_17through32 JMP emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm16K_memmove_move_33through64 emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm16K_memmove_move_17through32: MOVOU (R8), X0 MOVOU -16(R8)(SI*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(SI*1) JMP memmove_mid_end_copy_match_emit_encodeBetterBlockAsm16K emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm16K_memmove_move_33through64: MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(SI*1), X2 MOVOU -16(R8)(SI*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(SI*1) MOVOU X3, -16(CX)(SI*1) memmove_mid_end_copy_match_emit_encodeBetterBlockAsm16K: MOVQ R9, CX JMP match_emit_nolits_encodeBetterBlockAsm16K memmove_long_match_emit_encodeBetterBlockAsm16K: LEAQ (CX)(SI*1), R9 // genMemMoveLong MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(SI*1), X2 MOVOU -16(R8)(SI*1), X3 MOVQ SI, R12 SHRQ $0x05, R12 MOVQ CX, R10 ANDL $0x0000001f, R10 MOVQ $0x00000040, R13 SUBQ R10, R13 DECQ R12 JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm16Klarge_forward_sse_loop_32 LEAQ -32(R8)(R13*1), R10 LEAQ -32(CX)(R13*1), R14 emit_lit_memmove_long_match_emit_encodeBetterBlockAsm16Klarge_big_loop_back: MOVOU (R10), X4 MOVOU 16(R10), X5 MOVOA X4, (R14) MOVOA X5, 16(R14) ADDQ $0x20, R14 ADDQ $0x20, R10 ADDQ $0x20, R13 DECQ R12 JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm16Klarge_big_loop_back emit_lit_memmove_long_match_emit_encodeBetterBlockAsm16Klarge_forward_sse_loop_32: MOVOU -32(R8)(R13*1), X4 MOVOU -16(R8)(R13*1), X5 MOVOA X4, -32(CX)(R13*1) MOVOA X5, -16(CX)(R13*1) ADDQ $0x20, R13 CMPQ SI, R13 JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm16Klarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(SI*1) MOVOU X3, -16(CX)(SI*1) MOVQ R9, CX 
match_emit_nolits_encodeBetterBlockAsm16K: ADDL R11, AX ADDL $0x04, R11 MOVL AX, 12(SP) // emitCopy CMPL DI, $0x00000400 JA two_byte_match_nolit_encodeBetterBlockAsm16K CMPL R11, $0x00000013 JAE emit_one_longer_match_nolit_encodeBetterBlockAsm16K LEAL -1(DI), SI SHLL $0x06, SI LEAL -15(SI)(R11*4), SI MOVW SI, (CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm16K emit_one_longer_match_nolit_encodeBetterBlockAsm16K: CMPL R11, $0x00000112 JAE emit_copy1_repeat_match_nolit_encodeBetterBlockAsm16K LEAL -1(DI), SI SHLL $0x06, SI LEAL 61(SI), SI MOVW SI, (CX) LEAL -18(R11), SI MOVB SI, 2(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm16K emit_copy1_repeat_match_nolit_encodeBetterBlockAsm16K: LEAL -1(DI), SI SHLL $0x06, SI LEAL 57(SI), SI MOVW SI, (CX) ADDQ $0x02, CX SUBL $0x12, R11 // emitRepeat LEAL -1(R11), SI CMPL R11, $0x1d JBE repeat_one_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm16K LEAL -30(R11), SI CMPL R11, $0x0000011e JB repeat_two_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm16K CMPL R11, $0x0001001e JB repeat_three_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm16K MOVB $0xfc, (CX) MOVL SI, 1(CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm16K repeat_three_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm16K: MOVB $0xf4, (CX) MOVW SI, 1(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm16K repeat_two_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm16K: MOVB $0xec, (CX) MOVB SI, 1(CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm16K repeat_one_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm16K: XORL SI, SI LEAL -4(SI)(R11*8), SI MOVB SI, (CX) ADDQ $0x01, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm16K two_byte_match_nolit_encodeBetterBlockAsm16K: // emitCopy2 LEAL -64(DI), DI LEAL -4(R11), R11 MOVW DI, 1(CX) CMPL R11, $0x3c JBE emit_copy2_0_match_nolit_encodeBetterBlockAsm16K_emit2 LEAL -60(R11), SI CMPL R11, 
$0x0000013c JB emit_copy2_1_match_nolit_encodeBetterBlockAsm16K_emit2 CMPL R11, $0x0001003c JB emit_copy2_2_match_nolit_encodeBetterBlockAsm16K_emit2 MOVB $0xfe, (CX) MOVL SI, 3(CX) ADDQ $0x06, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm16K emit_copy2_2_match_nolit_encodeBetterBlockAsm16K_emit2: MOVB $0xfa, (CX) MOVW SI, 3(CX) ADDQ $0x05, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm16K emit_copy2_1_match_nolit_encodeBetterBlockAsm16K_emit2: MOVB $0xf6, (CX) MOVB SI, 3(CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm16K emit_copy2_0_match_nolit_encodeBetterBlockAsm16K_emit2: MOVL $0x00000002, SI LEAL (SI)(R11*4), SI MOVB SI, (CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm16K // emitLiteralsDstP MOVL 12(SP), SI CMPL SI, BX JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm16K MOVL BX, DI MOVL BX, 12(SP) LEAQ (DX)(SI*1), R8 SUBL SI, DI // emitLiteral LEAL -1(DI), SI CMPL SI, $0x1d JB one_byte_match_emit_repeat_encodeBetterBlockAsm16K SUBL $0x1d, SI CMPL SI, $0x00000100 JB two_bytes_match_emit_repeat_encodeBetterBlockAsm16K JB three_bytes_match_emit_repeat_encodeBetterBlockAsm16K three_bytes_match_emit_repeat_encodeBetterBlockAsm16K: MOVB $0xf0, (CX) MOVW SI, 1(CX) ADDQ $0x03, CX ADDL $0x1d, SI JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm16K two_bytes_match_emit_repeat_encodeBetterBlockAsm16K: MOVB $0xe8, (CX) MOVB SI, 1(CX) ADDL $0x1d, SI ADDQ $0x02, CX CMPL SI, $0x40 JB memmove_midmatch_emit_repeat_encodeBetterBlockAsm16K JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm16K one_byte_match_emit_repeat_encodeBetterBlockAsm16K: SHLB $0x03, SI MOVB SI, (CX) ADDQ $0x01, CX LEAQ (CX)(DI*1), SI // genMemMoveShort // margin: 8, min move: 1 CMPQ DI, $0x08 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm16K_memmove_move_8 CMPQ DI, $0x10 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm16K_memmove_move_8through16 CMPQ DI, $0x20 JBE 
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm16K_memmove_move_17through32 JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm16K_memmove_move_33through64 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm16K_memmove_move_8: MOVQ (R8), R9 MOVQ R9, (CX) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm16K emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm16K_memmove_move_8through16: MOVQ (R8), R9 MOVQ -8(R8)(DI*1), R8 MOVQ R9, (CX) MOVQ R8, -8(CX)(DI*1) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm16K emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm16K_memmove_move_17through32: MOVOU (R8), X0 MOVOU -16(R8)(DI*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(DI*1) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm16K emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm16K_memmove_move_33through64: MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(DI*1), X2 MOVOU -16(R8)(DI*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(DI*1) MOVOU X3, -16(CX)(DI*1) memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm16K: MOVQ SI, CX JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm16K memmove_midmatch_emit_repeat_encodeBetterBlockAsm16K: LEAQ (CX)(DI*1), SI // genMemMoveShort // margin: 8, min move: 30 CMPQ DI, $0x20 JBE emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm16K_memmove_move_17through32 JMP emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm16K_memmove_move_33through64 emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm16K_memmove_move_17through32: MOVOU (R8), X0 MOVOU -16(R8)(DI*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(DI*1) JMP memmove_mid_end_copy_match_emit_repeat_encodeBetterBlockAsm16K emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm16K_memmove_move_33through64: MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(DI*1), X2 MOVOU -16(R8)(DI*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(DI*1) MOVOU X3, -16(CX)(DI*1) 
memmove_mid_end_copy_match_emit_repeat_encodeBetterBlockAsm16K: MOVQ SI, CX JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm16K memmove_long_match_emit_repeat_encodeBetterBlockAsm16K: LEAQ (CX)(DI*1), SI // genMemMoveLong MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(DI*1), X2 MOVOU -16(R8)(DI*1), X3 MOVQ DI, R10 SHRQ $0x05, R10 MOVQ CX, R9 ANDL $0x0000001f, R9 MOVQ $0x00000040, R12 SUBQ R9, R12 DECQ R10 JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm16Klarge_forward_sse_loop_32 LEAQ -32(R8)(R12*1), R9 LEAQ -32(CX)(R12*1), R13 emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm16Klarge_big_loop_back: MOVOU (R9), X4 MOVOU 16(R9), X5 MOVOA X4, (R13) MOVOA X5, 16(R13) ADDQ $0x20, R13 ADDQ $0x20, R9 ADDQ $0x20, R12 DECQ R10 JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm16Klarge_big_loop_back emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm16Klarge_forward_sse_loop_32: MOVOU -32(R8)(R12*1), X4 MOVOU -16(R8)(R12*1), X5 MOVOA X4, -32(CX)(R12*1) MOVOA X5, -16(CX)(R12*1) ADDQ $0x20, R12 CMPQ DI, R12 JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm16Klarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(DI*1) MOVOU X3, -16(CX)(DI*1) MOVQ SI, CX emit_literal_done_match_emit_repeat_encodeBetterBlockAsm16K: ADDL R11, AX ADDL $0x04, R11 MOVL AX, 12(SP) // emitRepeat LEAL -1(R11), SI CMPL R11, $0x1d JBE repeat_one_match_nolit_repeat_encodeBetterBlockAsm16K LEAL -30(R11), SI CMPL R11, $0x0000011e JB repeat_two_match_nolit_repeat_encodeBetterBlockAsm16K CMPL R11, $0x0001001e JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm16K MOVB $0xfc, (CX) MOVL SI, 1(CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm16K repeat_three_match_nolit_repeat_encodeBetterBlockAsm16K: MOVB $0xf4, (CX) MOVW SI, 1(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm16K repeat_two_match_nolit_repeat_encodeBetterBlockAsm16K: MOVB $0xec, (CX) MOVB SI, 1(CX) ADDQ 
$0x02, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm16K repeat_one_match_nolit_repeat_encodeBetterBlockAsm16K: XORL SI, SI LEAL -4(SI)(R11*8), SI MOVB SI, (CX) ADDQ $0x01, CX match_nolit_emitcopy_end_encodeBetterBlockAsm16K: CMPL AX, 8(SP) JAE emit_remainder_encodeBetterBlockAsm16K CMPQ CX, (SP) JB match_nolit_dst_ok_encodeBetterBlockAsm16K MOVQ $0x00000000, ret+56(FP) RET match_nolit_dst_ok_encodeBetterBlockAsm16K: MOVQ tmp+48(FP), SI MOVQ $0x0000cf1bbcdcbf9b, DI MOVQ $0x9e3779b1, R8 LEAQ 1(BX), BX LEAQ -2(AX), R9 MOVQ (DX)(BX*1), R10 MOVQ 1(DX)(BX*1), R11 MOVQ (DX)(R9*1), R12 MOVQ 1(DX)(R9*1), R13 SHLQ $0x10, R10 IMULQ DI, R10 SHRQ $0x32, R10 SHLQ $0x20, R11 IMULQ R8, R11 SHRQ $0x35, R11 SHLQ $0x10, R12 IMULQ DI, R12 SHRQ $0x32, R12 SHLQ $0x20, R13 IMULQ R8, R13 SHRQ $0x35, R13 LEAQ 1(BX), R8 LEAQ 1(R9), R14 MOVW BX, (SI)(R10*2) MOVW R9, (SI)(R12*2) LEAQ 1(R9)(BX*1), R10 SHRQ $0x01, R10 ADDQ $0x01, BX SUBQ $0x01, R9 MOVW R8, 32768(SI)(R11*2) MOVW R14, 32768(SI)(R13*2) index_loop_encodeBetterBlockAsm16K: CMPQ R10, R9 JAE search_loop_encodeBetterBlockAsm16K MOVQ (DX)(BX*1), R8 MOVQ (DX)(R10*1), R11 SHLQ $0x10, R8 IMULQ DI, R8 SHRQ $0x32, R8 SHLQ $0x10, R11 IMULQ DI, R11 SHRQ $0x32, R11 MOVW BX, (SI)(R8*2) MOVW R9, (SI)(R11*2) ADDQ $0x02, BX ADDQ $0x02, R10 JMP index_loop_encodeBetterBlockAsm16K emit_remainder_encodeBetterBlockAsm16K: MOVQ src_len+32(FP), AX SUBL 12(SP), AX LEAQ 3(CX)(AX*1), AX CMPQ AX, (SP) JB emit_remainder_ok_encodeBetterBlockAsm16K MOVQ $0x00000000, ret+56(FP) RET emit_remainder_ok_encodeBetterBlockAsm16K: MOVQ src_len+32(FP), AX // emitLiteralsDstP MOVL 12(SP), BX CMPL BX, AX JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm16K MOVL AX, SI MOVL AX, 12(SP) LEAQ (DX)(BX*1), AX SUBL BX, SI // emitLiteral LEAL -1(SI), DX CMPL DX, $0x1d JB one_byte_emit_remainder_encodeBetterBlockAsm16K SUBL $0x1d, DX CMPL DX, $0x00000100 JB two_bytes_emit_remainder_encodeBetterBlockAsm16K JB three_bytes_emit_remainder_encodeBetterBlockAsm16K 
three_bytes_emit_remainder_encodeBetterBlockAsm16K: MOVB $0xf0, (CX) MOVW DX, 1(CX) ADDQ $0x03, CX ADDL $0x1d, DX JMP memmove_long_emit_remainder_encodeBetterBlockAsm16K two_bytes_emit_remainder_encodeBetterBlockAsm16K: MOVB $0xe8, (CX) MOVB DL, 1(CX) ADDL $0x1d, DX ADDQ $0x02, CX CMPL DX, $0x40 JB memmove_midemit_remainder_encodeBetterBlockAsm16K JMP memmove_long_emit_remainder_encodeBetterBlockAsm16K one_byte_emit_remainder_encodeBetterBlockAsm16K: SHLB $0x03, DL MOVB DL, (CX) ADDQ $0x01, CX LEAQ (CX)(SI*1), DX MOVL SI, BX // genMemMoveShort // margin: -1, min move: 1 CMPQ BX, $0x03 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm16K_memmove_move_1or2 JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm16K_memmove_move_3 CMPQ BX, $0x08 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm16K_memmove_move_4through8 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm16K_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm16K_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm16K_memmove_move_33through64 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm16K_memmove_move_1or2: MOVB (AX), SI MOVB -1(AX)(BX*1), AL MOVB SI, (CX) MOVB AL, -1(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm16K emit_lit_memmove_emit_remainder_encodeBetterBlockAsm16K_memmove_move_3: MOVW (AX), SI MOVB 2(AX), AL MOVW SI, (CX) MOVB AL, 2(CX) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm16K emit_lit_memmove_emit_remainder_encodeBetterBlockAsm16K_memmove_move_4through8: MOVL (AX), SI MOVL -4(AX)(BX*1), AX MOVL SI, (CX) MOVL AX, -4(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm16K emit_lit_memmove_emit_remainder_encodeBetterBlockAsm16K_memmove_move_8through16: MOVQ (AX), SI MOVQ -8(AX)(BX*1), AX MOVQ SI, (CX) MOVQ AX, -8(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm16K 
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm16K_memmove_move_17through32: MOVOU (AX), X0 MOVOU -16(AX)(BX*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm16K emit_lit_memmove_emit_remainder_encodeBetterBlockAsm16K_memmove_move_33through64: MOVOU (AX), X0 MOVOU 16(AX), X1 MOVOU -32(AX)(BX*1), X2 MOVOU -16(AX)(BX*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(BX*1) MOVOU X3, -16(CX)(BX*1) memmove_end_copy_emit_remainder_encodeBetterBlockAsm16K: MOVQ DX, CX JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm16K memmove_midemit_remainder_encodeBetterBlockAsm16K: LEAQ (CX)(SI*1), DX MOVL SI, BX // genMemMoveShort // margin: -2, min move: 30 CMPQ BX, $0x20 JBE emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm16K_memmove_move_17through32 JMP emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm16K_memmove_move_33through64 emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm16K_memmove_move_17through32: MOVOU (AX), X0 MOVOU -16(AX)(BX*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(BX*1) JMP memmove_mid_end_copy_emit_remainder_encodeBetterBlockAsm16K emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm16K_memmove_move_33through64: MOVOU (AX), X0 MOVOU 16(AX), X1 MOVOU -32(AX)(BX*1), X2 MOVOU -16(AX)(BX*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(BX*1) MOVOU X3, -16(CX)(BX*1) memmove_mid_end_copy_emit_remainder_encodeBetterBlockAsm16K: MOVQ DX, CX JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm16K memmove_long_emit_remainder_encodeBetterBlockAsm16K: LEAQ (CX)(SI*1), DX MOVL SI, BX // genMemMoveLong MOVOU (AX), X0 MOVOU 16(AX), X1 MOVOU -32(AX)(BX*1), X2 MOVOU -16(AX)(BX*1), X3 MOVQ BX, DI SHRQ $0x05, DI MOVQ CX, SI ANDL $0x0000001f, SI MOVQ $0x00000040, R8 SUBQ SI, R8 DECQ DI JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm16Klarge_forward_sse_loop_32 LEAQ -32(AX)(R8*1), SI LEAQ -32(CX)(R8*1), R9 
emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm16Klarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ $0x20, R9
	ADDQ $0x20, SI
	ADDQ $0x20, R8
	DECQ DI
	JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm16Klarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm16Klarge_forward_sse_loop_32:
	MOVOU -32(AX)(R8*1), X4
	MOVOU -16(AX)(R8*1), X5
	MOVOA X4, -32(CX)(R8*1)
	MOVOA X5, -16(CX)(R8*1)
	ADDQ $0x20, R8
	CMPQ BX, R8
	JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm16Klarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)
	MOVQ DX, CX

emit_literal_done_emit_remainder_encodeBetterBlockAsm16K:
	MOVQ dst_base+0(FP), AX
	SUBQ AX, CX
	MOVQ CX, ret+56(FP)
	RET

// func encodeBetterBlockAsm4K(dst []byte, src []byte, tmp *[10240]byte) int
// Requires: BMI, CMOV, SSE2
//
// Encodes src into dst. tmp is cleared on entry and then used as two tables
// of 16-bit src positions:
//   tmp+0    : 4096 entries, indexed by a 12-bit hash of the low 6 bytes of
//              an 8-byte load ("long" table)
//   tmp+8192 : 1024 entries, indexed by a 10-bit hash of the low 4 bytes
//              ("short" table)
// Returns the number of bytes written to dst, or 0 as soon as a pending
// write could reach the dst limit stored in 0(SP).
//
// Registers in the main loop:
//   AX = current src position   CX = dst write pointer   DX = src base
//   BX = tmp base at the top of each search iteration (reused as scratch)
//
// Stack slots:
//   0(SP)  dst limit = dst_base + src_len - 11 - (src_len >> 5)
//   8(SP)  src_len - 8; searching stops once the next position reaches it
//   12(SP) src position where the not-yet-emitted literals begin
//   16(SP) offset of the most recent match (starts at 1); used by the repeat check
//   20(SP) src position to continue from when nothing matches
TEXT ·encodeBetterBlockAsm4K(SB), $24-64
	MOVQ tmp+48(FP), AX
	MOVQ dst_base+0(FP), CX
	MOVQ $0x00000050, DX
	PXOR X0, X0

	// Clear tmp: 0x50 iterations x 128 bytes = 10240 bytes.
zero_loop_encodeBetterBlockAsm4K:
	MOVOU X0, (AX)
	MOVOU X0, 16(AX)
	MOVOU X0, 32(AX)
	MOVOU X0, 48(AX)
	MOVOU X0, 64(AX)
	MOVOU X0, 80(AX)
	MOVOU X0, 96(AX)
	MOVOU X0, 112(AX)
	ADDQ $0x80, AX
	DECQ DX
	JNZ zero_loop_encodeBetterBlockAsm4K

	// Initialise the stack slots described in the header.
	MOVL $0x00000000, 12(SP)
	MOVQ src_len+32(FP), AX
	LEAQ -11(AX), DX
	LEAQ -8(AX), BX
	MOVL BX, 8(SP)
	SHRQ $0x05, AX
	SUBL AX, DX
	LEAQ (CX)(DX*1), DX
	MOVQ DX, (SP)
	MOVL $0x00000001, AX
	MOVL AX, 16(SP)
	MOVQ src_base+24(FP), DX

search_loop_encodeBetterBlockAsm4K:
	MOVQ tmp+48(FP), BX

	// Next position SI = AX + 1 + ((AX - 12(SP)) >> 5): the step grows with
	// the number of bytes scanned since the last emit.
	MOVL AX, SI
	SUBL 12(SP), SI
	SHRL $0x05, SI
	LEAL 1(AX)(SI*1), SI
	CMPL SI, 8(SP)
	JAE emit_remainder_encodeBetterBlockAsm4K
	MOVQ (DX)(AX*1), DI
	MOVL SI, 20(SP)

	// R10 = long-table index, R11 = short-table index for the 8 bytes in DI.
	MOVQ $0x0000cf1bbcdcbf9b, R9
	MOVQ $0x9e3779b1, SI
	MOVQ DI, R10
	MOVQ DI, R11
	SHLQ $0x10, R10
	IMULQ R9, R10
	SHRQ $0x34, R10
	SHLQ $0x20, R11
	IMULQ SI, R11
	SHRQ $0x36, R11

	// Fetch both candidates (SI = long, R8 = short), then record the current
	// position in both tables.
	MOVWLZX (BX)(R10*2), SI
	MOVWLZX 8192(BX)(R11*2), R8
	MOVW AX, (BX)(R10*2)
	MOVW AX, 8192(BX)(R11*2)

	// Long candidate equal on all 8 bytes: take it immediately.
	MOVQ (DX)(SI*1), R10
	CMPQ R10, DI
	JEQ candidate_match_encodeBetterBlockAsm4K
	MOVQ (DX)(R8*1), R11

	// NOTE(review): the flags from this CMPQ are overwritten by the SUBL
	// below before any branch reads them; R11 is compared again (32-bit) at
	// no_repeat_found.
	CMPQ R11, DI

	// Repeat check: bytes 1..4 at the current position must equal bytes 1..4
	// at (position - 16(SP)); the mask ignores byte 0 and bytes 5..7.
	MOVL AX, R12
	SUBL 16(SP), R12
	MOVQ (DX)(R12*1), R12
	MOVQ $0x000000ffffffff00, R13
	XORQ DI, R12
	TESTQ R13, R12
	JNE no_repeat_found_encodeBetterBlockAsm4K

	// Repeat found starting at AX+1. BX = its start, DI = start - offset.
	LEAL 1(AX), BX
	MOVL 12(SP), SI
	MOVL BX, DI
	SUBL 16(SP), DI
	JZ repeat_extend_back_end_encodeBetterBlockAsm4K

	// Extend the repeat backwards while the preceding bytes match, stopping
	// at 12(SP) or when the source side reaches position 0.
repeat_extend_back_loop_encodeBetterBlockAsm4K:
	CMPL BX, SI
	JBE repeat_extend_back_end_encodeBetterBlockAsm4K
	MOVB -1(DX)(DI*1), R8
	MOVB -1(DX)(BX*1), R9
	CMPB R8, R9
	JNE repeat_extend_back_end_encodeBetterBlockAsm4K
	LEAL -1(BX), BX
	DECL DI
	JNZ repeat_extend_back_loop_encodeBetterBlockAsm4K

repeat_extend_back_end_encodeBetterBlockAsm4K:
	// Bail out with 0 unless dst + 3 + pending literal count stays below the limit.
	MOVL BX, SI
	SUBL 12(SP), SI
	LEAQ 3(CX)(SI*1), SI
	CMPQ SI, (SP)
	JB repeat_dst_size_check_encodeBetterBlockAsm4K
	MOVQ $0x00000000, ret+56(FP)
	RET

repeat_dst_size_check_encodeBetterBlockAsm4K:
	// emitLiteralsDstP
	// Emit src[12(SP):BX] as literals (skipped when empty). R8 = source
	// pointer, DI = literal count.
	MOVL 12(SP), SI
	CMPL SI, BX
	JEQ emit_literal_done_repeat_emit_encodeBetterBlockAsm4K
	MOVL BX, DI
	MOVL BX, 12(SP)
	LEAQ (DX)(SI*1), R8
	SUBL SI, DI

	// emitLiteral
	// Header size is chosen from (count - 1): 1 byte below 29, else a tag
	// byte followed by (count - 30) in 1 or 2 bytes.
	LEAL -1(DI), SI
	CMPL SI, $0x1d
	JB one_byte_repeat_emit_encodeBetterBlockAsm4K
	SUBL $0x1d, SI
	CMPL SI, $0x00000100
	JB two_bytes_repeat_emit_encodeBetterBlockAsm4K

	// This JB reads the same flags as the one above and so is never taken;
	// execution falls through to the label either way.
	JB three_bytes_repeat_emit_encodeBetterBlockAsm4K

three_bytes_repeat_emit_encodeBetterBlockAsm4K:
	MOVB $0xf0, (CX)
	MOVW SI, 1(CX)
	ADDQ $0x03, CX
	ADDL $0x1d, SI
	JMP memmove_long_repeat_emit_encodeBetterBlockAsm4K

two_bytes_repeat_emit_encodeBetterBlockAsm4K:
	MOVB $0xe8, (CX)
	MOVB SI, 1(CX)
	ADDL $0x1d, SI
	ADDQ $0x02, CX
	CMPL SI, $0x40
	JB memmove_midrepeat_emit_encodeBetterBlockAsm4K
	JMP memmove_long_repeat_emit_encodeBetterBlockAsm4K

one_byte_repeat_emit_encodeBetterBlockAsm4K:
	SHLB $0x03, SI
	MOVB SI, (CX)
	ADDQ $0x01, CX
	LEAQ (CX)(DI*1), SI

	// genMemMoveShort
	// margin: 8, min move: 1
	// Counts up to 8 are copied with one full 8-byte load/store.
	CMPQ DI, $0x08
	JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4K_memmove_move_8
	CMPQ DI, $0x10
	JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4K_memmove_move_8through16
	CMPQ DI, $0x20
	JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4K_memmove_move_17through32
	JMP emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4K_memmove_move_33through64

emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4K_memmove_move_8:
	MOVQ (R8), R9
	MOVQ R9, (CX)
	JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm4K

emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4K_memmove_move_8through16:
	MOVQ (R8), R9
	MOVQ -8(R8)(DI*1), R8
	MOVQ R9, (CX)
	MOVQ R8, -8(CX)(DI*1)
	JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm4K

emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4K_memmove_move_17through32:
	MOVOU (R8), X0
	MOVOU -16(R8)(DI*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(DI*1)
	JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm4K

emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4K_memmove_move_33through64:
	MOVOU (R8), X0
	MOVOU 16(R8), X1
	MOVOU -32(R8)(DI*1), X2
	MOVOU -16(R8)(DI*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(DI*1)
	MOVOU X3, -16(CX)(DI*1)

memmove_end_copy_repeat_emit_encodeBetterBlockAsm4K:
	MOVQ SI, CX
	JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm4K

memmove_midrepeat_emit_encodeBetterBlockAsm4K:
	LEAQ (CX)(DI*1), SI

	// genMemMoveShort
	// margin: 8, min move: 30
	CMPQ DI, $0x20
	JBE emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm4K_memmove_move_17through32
	JMP emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm4K_memmove_move_33through64

emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm4K_memmove_move_17through32:
	MOVOU (R8), X0
	MOVOU -16(R8)(DI*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(DI*1)
	JMP memmove_mid_end_copy_repeat_emit_encodeBetterBlockAsm4K

emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm4K_memmove_move_33through64:
	MOVOU (R8), X0
	MOVOU 16(R8), X1
	MOVOU -32(R8)(DI*1), X2
	MOVOU -16(R8)(DI*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(DI*1)
	MOVOU X3, -16(CX)(DI*1)

memmove_mid_end_copy_repeat_emit_encodeBetterBlockAsm4K:
	MOVQ SI, CX
	JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm4K

memmove_long_repeat_emit_encodeBetterBlockAsm4K:
	LEAQ (CX)(DI*1), SI

	// genMemMoveLong
	// The first and last 32 bytes are loaded up front (X0..X3) and stored
	// last; the middle is copied 32 bytes at a time with MOVOA stores, R11
	// being derived from the low 5 bits of CX.
	MOVOU (R8), X0
	MOVOU 16(R8), X1
	MOVOU -32(R8)(DI*1), X2
	MOVOU -16(R8)(DI*1), X3
	MOVQ DI, R10
	SHRQ $0x05, R10
	MOVQ CX, R9
	ANDL $0x0000001f, R9
	MOVQ $0x00000040, R11
	SUBQ R9, R11
	DECQ R10
	JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm4Klarge_forward_sse_loop_32
	LEAQ -32(R8)(R11*1), R9
	LEAQ -32(CX)(R11*1), R12

emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm4Klarge_big_loop_back:
	MOVOU (R9), X4
	MOVOU 16(R9), X5
	MOVOA X4, (R12)
	MOVOA X5, 16(R12)
	ADDQ $0x20, R12
	ADDQ $0x20, R9
	ADDQ $0x20, R11
	DECQ R10
	JNA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm4Klarge_big_loop_back

emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm4Klarge_forward_sse_loop_32:
	MOVOU -32(R8)(R11*1), X4
	MOVOU -16(R8)(R11*1), X5
	MOVOA X4, -32(CX)(R11*1)
	MOVOA X5, -16(CX)(R11*1)
	ADDQ $0x20, R11
	CMPQ DI, R11
	JAE emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm4Klarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(DI*1)
	MOVOU X3, -16(CX)(DI*1)
	MOVQ SI, CX

emit_literal_done_repeat_emit_encodeBetterBlockAsm4K:
	// Advance AX by 5, then measure how far the repeat extends forward.
	// R8 = current data, SI = data one offset back, DI = bytes left in src.
	ADDL $0x05, AX
	MOVL AX, SI
	SUBL 16(SP), SI
	MOVQ src_len+32(FP), DI
	SUBL AX, DI
	LEAQ (DX)(AX*1), R8
	LEAQ (DX)(SI*1), SI

	// matchLen
	// R10 = number of equal bytes; compares 16, 8, 4, 2, then 1 byte at a time.
	XORL R10, R10
	JMP matchlen_loop_16_entry_repeat_extend_encodeBetterBlockAsm4K

matchlen_loopback_16_repeat_extend_encodeBetterBlockAsm4K:
	MOVQ (R8)(R10*1), R9
	MOVQ 8(R8)(R10*1), R11
	XORQ (SI)(R10*1), R9
	JNZ matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm4K
	XORQ 8(SI)(R10*1), R11
	JNZ matchlen_bsf_16repeat_extend_encodeBetterBlockAsm4K
	LEAL -16(DI), DI
	LEAL 16(R10), R10

matchlen_loop_16_entry_repeat_extend_encodeBetterBlockAsm4K:
	CMPL DI, $0x10
	JAE matchlen_loopback_16_repeat_extend_encodeBetterBlockAsm4K
	JMP matchlen_match8_repeat_extend_encodeBetterBlockAsm4K

matchlen_bsf_16repeat_extend_encodeBetterBlockAsm4K:
	// Index of the lowest differing bit, divided by 8, gives the matching byte count.
#ifdef GOAMD64_v3
	TZCNTQ R11, R11

#else
	BSFQ R11, R11

#endif
	SARQ $0x03, R11
	LEAL 8(R10)(R11*1), R10
	JMP repeat_extend_forward_end_encodeBetterBlockAsm4K

matchlen_match8_repeat_extend_encodeBetterBlockAsm4K:
	CMPL DI, $0x08
	JB matchlen_match4_repeat_extend_encodeBetterBlockAsm4K
	MOVQ (R8)(R10*1), R9
	XORQ (SI)(R10*1), R9
	JNZ matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm4K
	LEAL -8(DI), DI
	LEAL 8(R10), R10
	JMP matchlen_match4_repeat_extend_encodeBetterBlockAsm4K

matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm4K:
#ifdef GOAMD64_v3
	TZCNTQ R9, R9

#else
	BSFQ R9, R9

#endif
	SARQ $0x03, R9
	LEAL (R10)(R9*1), R10
	JMP repeat_extend_forward_end_encodeBetterBlockAsm4K

matchlen_match4_repeat_extend_encodeBetterBlockAsm4K:
	CMPL DI, $0x04
	JB matchlen_match2_repeat_extend_encodeBetterBlockAsm4K
	MOVL (R8)(R10*1), R9
	CMPL (SI)(R10*1), R9
	JNE matchlen_match2_repeat_extend_encodeBetterBlockAsm4K
	LEAL -4(DI), DI
	LEAL 4(R10), R10

matchlen_match2_repeat_extend_encodeBetterBlockAsm4K:
	CMPL DI, $0x01
	JE matchlen_match1_repeat_extend_encodeBetterBlockAsm4K
	JB repeat_extend_forward_end_encodeBetterBlockAsm4K
	MOVW (R8)(R10*1), R9
	CMPW (SI)(R10*1), R9
	JNE matchlen_match1_repeat_extend_encodeBetterBlockAsm4K
	LEAL 2(R10), R10
	SUBL $0x02, DI
	JZ repeat_extend_forward_end_encodeBetterBlockAsm4K

matchlen_match1_repeat_extend_encodeBetterBlockAsm4K:
	MOVB (R8)(R10*1), R9
	CMPB (SI)(R10*1), R9
	JNE repeat_extend_forward_end_encodeBetterBlockAsm4K
	LEAL 1(R10), R10

repeat_extend_forward_end_encodeBetterBlockAsm4K:
	// SI = total repeat length = (new AX) - repeat start (BX).
	ADDL R10, AX
	MOVL AX, SI
	SUBL BX, SI

	// NOTE(review): BX is overwritten by the LEAL that follows, so this load
	// has no visible effect.
	MOVL 16(SP), BX

	// emitRepeat
	// Length <= 29 fits in a single byte; otherwise a tag byte is followed
	// by (length - 30) in 1, 2, or 4 stored bytes (CX advances by 2, 3, or 4).
	LEAL -1(SI), BX
	CMPL SI, $0x1d
	JBE repeat_one_match_repeat_encodeBetterBlockAsm4K
	LEAL -30(SI), BX
	CMPL SI, $0x0000011e
	JB repeat_two_match_repeat_encodeBetterBlockAsm4K
	CMPL SI, $0x0001001e
	JB repeat_three_match_repeat_encodeBetterBlockAsm4K
	MOVB $0xfc, (CX)
	MOVL BX, 1(CX)
	ADDQ $0x04, CX
	JMP repeat_end_emit_encodeBetterBlockAsm4K

repeat_three_match_repeat_encodeBetterBlockAsm4K:
	MOVB $0xf4, (CX)
	MOVW BX, 1(CX)
	ADDQ $0x03, CX
	JMP repeat_end_emit_encodeBetterBlockAsm4K

repeat_two_match_repeat_encodeBetterBlockAsm4K:
	MOVB $0xec, (CX)
	MOVB BL, 1(CX)
	ADDQ $0x02, CX
	JMP repeat_end_emit_encodeBetterBlockAsm4K

repeat_one_match_repeat_encodeBetterBlockAsm4K:
	// Single byte: length*8 - 4.
	XORL BX, BX
	LEAL -4(BX)(SI*8), BX
	MOVB BL, (CX)
	ADDQ $0x01, CX

repeat_end_emit_encodeBetterBlockAsm4K:
	MOVL AX, 12(SP)
	JMP search_loop_encodeBetterBlockAsm4K

no_repeat_found_encodeBetterBlockAsm4K:
	// Compare only the low 4 bytes of each candidate load against the
	// current data: long candidate first, then short.
	CMPL R10, DI
	JEQ candidate_match_encodeBetterBlockAsm4K
	CMPL R11, DI
	JEQ candidateS_match_encodeBetterBlockAsm4K

	// Nothing matched: continue from the saved next position.
	MOVL 20(SP), AX
	JMP search_loop_encodeBetterBlockAsm4K

candidateS_match_encodeBetterBlockAsm4K:
	// Short candidate matched 4 bytes. First probe the long table with the
	// data at position + 1 (DI >> 8) and record position + 1 there; if that
	// entry matches 4 bytes, use it at AX + 1. Otherwise restore AX and use
	// the short candidate (R8).
	SHRQ $0x08, DI
	MOVQ DI, R10
	SHLQ $0x10, R10
	IMULQ R9, R10
	SHRQ $0x34, R10
	MOVWLZX (BX)(R10*2), SI
	INCL AX
	MOVW AX, (BX)(R10*2)
	CMPL (DX)(SI*1), DI
	JEQ candidate_match_encodeBetterBlockAsm4K
	DECL AX
	MOVL R8, SI

candidate_match_encodeBetterBlockAsm4K:
	// AX = match position, SI = candidate position. Extend both backwards
	// while the preceding bytes match, not past 12(SP) or candidate 0.
	MOVL 12(SP), BX
	TESTL SI, SI
	JZ match_extend_back_end_encodeBetterBlockAsm4K

match_extend_back_loop_encodeBetterBlockAsm4K:
	CMPL AX, BX
	JBE match_extend_back_end_encodeBetterBlockAsm4K
	MOVB -1(DX)(SI*1), DI
	MOVB -1(DX)(AX*1), R8
	CMPB DI, R8
	JNE match_extend_back_end_encodeBetterBlockAsm4K
	LEAL -1(AX), AX
	DECL SI
	JZ match_extend_back_end_encodeBetterBlockAsm4K
	JMP match_extend_back_loop_encodeBetterBlockAsm4K

match_extend_back_end_encodeBetterBlockAsm4K:
	// Bail out with 0 unless dst + 3 + pending literal count stays below the limit.
	MOVL AX, BX
	SUBL 12(SP), BX
	LEAQ 3(CX)(BX*1), BX
	CMPQ BX, (SP)
	JB match_dst_size_check_encodeBetterBlockAsm4K
	MOVQ $0x00000000, ret+56(FP)
	RET

match_dst_size_check_encodeBetterBlockAsm4K:
	// BX = match start. Advance both positions by 4, then count further
	// equal bytes into R11 (DI = bytes left in src).
	MOVL AX, BX
	ADDL $0x04, AX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), DI
	SUBL AX, DI
	LEAQ (DX)(AX*1), R8
	LEAQ (DX)(SI*1), R9

	// matchLen
	XORL R11, R11
	JMP matchlen_loop_16_entry_match_nolit_encodeBetterBlockAsm4K

matchlen_loopback_16_match_nolit_encodeBetterBlockAsm4K:
	MOVQ (R8)(R11*1), R10
	MOVQ 8(R8)(R11*1), R12
	XORQ (R9)(R11*1), R10
	JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm4K
	XORQ 8(R9)(R11*1), R12
	JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm4K
	LEAL -16(DI), DI
	LEAL 16(R11), R11

matchlen_loop_16_entry_match_nolit_encodeBetterBlockAsm4K:
	CMPL DI, $0x10
	JAE matchlen_loopback_16_match_nolit_encodeBetterBlockAsm4K
	JMP matchlen_match8_match_nolit_encodeBetterBlockAsm4K

matchlen_bsf_16match_nolit_encodeBetterBlockAsm4K:
#ifdef GOAMD64_v3
	TZCNTQ R12, R12

#else
	BSFQ R12, R12

#endif
	SARQ $0x03, R12
	LEAL 8(R11)(R12*1), R11
	JMP match_nolit_end_encodeBetterBlockAsm4K

matchlen_match8_match_nolit_encodeBetterBlockAsm4K:
	CMPL DI, $0x08
	JB matchlen_match4_match_nolit_encodeBetterBlockAsm4K
	MOVQ (R8)(R11*1), R10
	XORQ (R9)(R11*1), R10
	JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm4K
	LEAL -8(DI), DI
	LEAL 8(R11), R11
	JMP matchlen_match4_match_nolit_encodeBetterBlockAsm4K

matchlen_bsf_8_match_nolit_encodeBetterBlockAsm4K:
#ifdef GOAMD64_v3
	TZCNTQ R10, R10

#else
	BSFQ R10, R10

#endif
	SARQ $0x03, R10
	LEAL (R11)(R10*1), R11
	JMP match_nolit_end_encodeBetterBlockAsm4K

matchlen_match4_match_nolit_encodeBetterBlockAsm4K:
	CMPL DI, $0x04
	JB matchlen_match2_match_nolit_encodeBetterBlockAsm4K
	MOVL (R8)(R11*1), R10
	CMPL (R9)(R11*1), R10
	JNE matchlen_match2_match_nolit_encodeBetterBlockAsm4K
	LEAL -4(DI), DI
	LEAL 4(R11), R11

matchlen_match2_match_nolit_encodeBetterBlockAsm4K:
	CMPL DI, $0x01
	JE matchlen_match1_match_nolit_encodeBetterBlockAsm4K
	JB match_nolit_end_encodeBetterBlockAsm4K
	MOVW (R8)(R11*1), R10
	CMPW (R9)(R11*1), R10
	JNE matchlen_match1_match_nolit_encodeBetterBlockAsm4K
	LEAL 2(R11), R11
	SUBL $0x02, DI
	JZ match_nolit_end_encodeBetterBlockAsm4K

matchlen_match1_match_nolit_encodeBetterBlockAsm4K:
	MOVB (R8)(R11*1), R10
	CMPB (R9)(R11*1), R10
	JNE match_nolit_end_encodeBetterBlockAsm4K
	LEAL 1(R11), R11

match_nolit_end_encodeBetterBlockAsm4K:
	// DI = offset = match position - candidate position; it becomes the new
	// repeat offset in 16(SP).
	MOVL AX, DI
	SUBL SI, DI
	MOVL DI, 16(SP)

	// Check if we can combine lit+copy
	// SI = pending literal count (BX - 12(SP)). The fused form below is used
	// only when 1 <= SI <= 4 and the offset is at least 0x40.
	MOVLQZX 12(SP), R8
	MOVL BX, SI
	SUBL R8, SI
	JZ match_emit_nolits_encodeBetterBlockAsm4K
	CMPL DI, $0x00000040
	JL match_emit_lits_encodeBetterBlockAsm4K
	CMPL SI, $0x04
	JA match_emit_lits_encodeBetterBlockAsm4K
	MOVL (DX)(R8*1), R8
	ADDL R11, AX
	ADDL $0x04, R11
	MOVL AX, 12(SP)

	// emitCopy2WithLits
	// Writes a tag byte, the 16-bit (offset - 0x40), then the literals: 4
	// bytes are stored from R8 but CX only advances by SI. R9 receives any
	// length that does not fit in the tag and is emitted as a repeat below.
	XORQ R9, R9
	SUBL $0x40, DI
	LEAL -11(R11), R10
	LEAL -4(R11), R11
	MOVW DI, 1(CX)
	CMPL R11, $0x07
	CMOVLGE R10, R9
	MOVQ $0x00000007, DI
	CMOVLLT R11, DI
	LEAL -1(SI)(DI*4), DI
	MOVL $0x00000003, R10
	LEAL (R10)(DI*8), DI
	MOVB DI, (CX)
	ADDQ $0x03, CX
	MOVL R8, (CX)
	ADDQ SI, CX
	TESTL R9, R9
	JZ match_nolit_emitcopy_end_encodeBetterBlockAsm4K

	// emitRepeat
	LEAL -1(R9), SI
	CMPL R9, $0x1d
	JBE repeat_one_match_emit_repeat_copy2_encodeBetterBlockAsm4K
	LEAL -30(R9), SI
	CMPL R9, $0x0000011e
	JB repeat_two_match_emit_repeat_copy2_encodeBetterBlockAsm4K
	CMPL R9, $0x0001001e
	JB repeat_three_match_emit_repeat_copy2_encodeBetterBlockAsm4K
	MOVB $0xfc, (CX)
	MOVL SI, 1(CX)
	ADDQ $0x04, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4K

repeat_three_match_emit_repeat_copy2_encodeBetterBlockAsm4K:
	MOVB $0xf4, (CX)
	MOVW SI, 1(CX)
	ADDQ $0x03, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4K

repeat_two_match_emit_repeat_copy2_encodeBetterBlockAsm4K:
	MOVB $0xec, (CX)
	MOVB SI, 1(CX)
	ADDQ $0x02, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4K

repeat_one_match_emit_repeat_copy2_encodeBetterBlockAsm4K:
	XORL SI, SI
	LEAL -4(SI)(R9*8), SI
	MOVB SI, (CX)
	ADDQ $0x01, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4K

match_emit_lits_encodeBetterBlockAsm4K:
	// Emit the SI pending literals separately (R8 = source pointer), then
	// fall into the plain copy emission at match_emit_nolits.
	LEAQ (DX)(R8*1), R8

	// emitLiteral
	LEAL -1(SI), R9
	CMPL R9, $0x1d
	JB one_byte_match_emit_encodeBetterBlockAsm4K
	SUBL $0x1d, R9
	CMPL R9, $0x00000100
	JB two_bytes_match_emit_encodeBetterBlockAsm4K

	// Never taken (same flags as the JB above); falls through to the label.
	JB three_bytes_match_emit_encodeBetterBlockAsm4K

three_bytes_match_emit_encodeBetterBlockAsm4K:
	MOVB $0xf0, (CX)
	MOVW R9, 1(CX)
	ADDQ $0x03, CX
	ADDL $0x1d, R9
	JMP memmove_long_match_emit_encodeBetterBlockAsm4K

two_bytes_match_emit_encodeBetterBlockAsm4K:
	MOVB $0xe8, (CX)
	MOVB R9, 1(CX)
	ADDL $0x1d, R9
	ADDQ $0x02, CX
	CMPL R9, $0x40
	JB memmove_midmatch_emit_encodeBetterBlockAsm4K
	JMP memmove_long_match_emit_encodeBetterBlockAsm4K

one_byte_match_emit_encodeBetterBlockAsm4K:
	SHLB $0x03, R9
	MOVB R9, (CX)
	ADDQ $0x01, CX
	LEAQ (CX)(SI*1), R9

	// genMemMoveShort
	// margin: 8, min move: 1
	CMPQ SI, $0x08
	JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4K_memmove_move_8
	CMPQ SI, $0x10
	JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4K_memmove_move_8through16
	CMPQ SI, $0x20
	JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4K_memmove_move_17through32
	JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm4K_memmove_move_33through64

emit_lit_memmove_match_emit_encodeBetterBlockAsm4K_memmove_move_8:
	MOVQ (R8), R10
	MOVQ R10, (CX)
	JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4K

emit_lit_memmove_match_emit_encodeBetterBlockAsm4K_memmove_move_8through16:
	MOVQ (R8), R10
	MOVQ -8(R8)(SI*1), R8
	MOVQ R10, (CX)
	MOVQ R8, -8(CX)(SI*1)
	JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4K

emit_lit_memmove_match_emit_encodeBetterBlockAsm4K_memmove_move_17through32:
	MOVOU (R8), X0
	MOVOU -16(R8)(SI*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(SI*1)
	JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4K

emit_lit_memmove_match_emit_encodeBetterBlockAsm4K_memmove_move_33through64:
	MOVOU (R8), X0
	MOVOU 16(R8), X1
	MOVOU -32(R8)(SI*1), X2
	MOVOU -16(R8)(SI*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(SI*1)
	MOVOU X3, -16(CX)(SI*1)

memmove_end_copy_match_emit_encodeBetterBlockAsm4K:
	MOVQ R9, CX
	JMP match_emit_nolits_encodeBetterBlockAsm4K

memmove_midmatch_emit_encodeBetterBlockAsm4K:
	LEAQ (CX)(SI*1), R9

	// genMemMoveShort
	// margin: 8, min move: 30
	CMPQ SI, $0x20
	JBE emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm4K_memmove_move_17through32
	JMP emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm4K_memmove_move_33through64

emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm4K_memmove_move_17through32:
	MOVOU (R8), X0
	MOVOU -16(R8)(SI*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(SI*1)
	JMP memmove_mid_end_copy_match_emit_encodeBetterBlockAsm4K

emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm4K_memmove_move_33through64:
	MOVOU (R8), X0
	MOVOU 16(R8), X1
	MOVOU -32(R8)(SI*1), X2
	MOVOU -16(R8)(SI*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(SI*1)
	MOVOU X3, -16(CX)(SI*1)

memmove_mid_end_copy_match_emit_encodeBetterBlockAsm4K:
	MOVQ R9, CX
	JMP match_emit_nolits_encodeBetterBlockAsm4K

memmove_long_match_emit_encodeBetterBlockAsm4K:
	LEAQ (CX)(SI*1), R9

	// genMemMoveLong
	MOVOU (R8), X0
	MOVOU 16(R8), X1
	MOVOU -32(R8)(SI*1), X2
	MOVOU -16(R8)(SI*1), X3
	MOVQ SI, R12
	SHRQ $0x05, R12
	MOVQ CX, R10
	ANDL $0x0000001f, R10
	MOVQ $0x00000040, R13
	SUBQ R10, R13
	DECQ R12
	JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4Klarge_forward_sse_loop_32
	LEAQ -32(R8)(R13*1), R10
	LEAQ -32(CX)(R13*1), R14

emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4Klarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R14)
	MOVOA X5, 16(R14)
	ADDQ $0x20, R14
	ADDQ $0x20, R10
	ADDQ $0x20, R13
	DECQ R12
	JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4Klarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4Klarge_forward_sse_loop_32:
	MOVOU -32(R8)(R13*1), X4
	MOVOU -16(R8)(R13*1), X5
	MOVOA X4, -32(CX)(R13*1)
	MOVOA X5, -16(CX)(R13*1)
	ADDQ $0x20, R13
	CMPQ SI, R13
	JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4Klarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(SI*1)
	MOVOU X3, -16(CX)(SI*1)
	MOVQ R9, CX

match_emit_nolits_encodeBetterBlockAsm4K:
	// AX moves past the match; R11 becomes the full match length
	// (extension + 4); 12(SP) records the new emit position.
	ADDL R11, AX
	ADDL $0x04, R11
	MOVL AX, 12(SP)

	// emitCopy
	// Offsets above 0x400 take the emitCopy2 path. Otherwise the 16-bit word
	// written is ((offset - 1) << 6) plus a length field, with one extra
	// length byte for lengths 19..273 and a trailing repeat for longer ones.
	CMPL DI, $0x00000400
	JA two_byte_match_nolit_encodeBetterBlockAsm4K
	CMPL R11, $0x00000013
	JAE emit_one_longer_match_nolit_encodeBetterBlockAsm4K
	LEAL -1(DI), SI
	SHLL $0x06, SI
	LEAL -15(SI)(R11*4), SI
	MOVW SI, (CX)
	ADDQ $0x02, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4K

emit_one_longer_match_nolit_encodeBetterBlockAsm4K:
	CMPL R11, $0x00000112
	JAE emit_copy1_repeat_match_nolit_encodeBetterBlockAsm4K
	LEAL -1(DI), SI
	SHLL $0x06, SI
	LEAL 61(SI), SI
	MOVW SI, (CX)
	LEAL -18(R11), SI
	MOVB SI, 2(CX)
	ADDQ $0x03, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4K

emit_copy1_repeat_match_nolit_encodeBetterBlockAsm4K:
	// Write a 2-byte copy, subtract 18 from the length, and emit the rest
	// as a repeat.
	LEAL -1(DI), SI
	SHLL $0x06, SI
	LEAL 57(SI), SI
	MOVW SI, (CX)
	ADDQ $0x02, CX
	SUBL $0x12, R11

	// emitRepeat
	LEAL -1(R11), SI
	CMPL R11, $0x1d
	JBE repeat_one_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm4K
	LEAL -30(R11), SI
	CMPL R11, $0x0000011e
	JB repeat_two_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm4K
	CMPL R11, $0x0001001e
	JB repeat_three_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm4K
	MOVB $0xfc, (CX)
	MOVL SI, 1(CX)
	ADDQ $0x04, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4K

repeat_three_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm4K:
	MOVB $0xf4, (CX)
	MOVW SI, 1(CX)
	ADDQ $0x03, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4K

repeat_two_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm4K:
	MOVB $0xec, (CX)
	MOVB SI, 1(CX)
	ADDQ $0x02, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4K

repeat_one_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm4K:
	XORL SI, SI
	LEAL -4(SI)(R11*8), SI
	MOVB SI, (CX)
	ADDQ $0x01, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4K

two_byte_match_nolit_encodeBetterBlockAsm4K:
	// emitCopy2
	// Stores the 16-bit (offset - 64) at 1(CX). (length - 4) up to 60 goes
	// into the tag byte; larger values use a fixed tag plus (length - 64) in
	// 1, 2, or 4 stored bytes at 3(CX) (CX advances by 4, 5, or 6).
	LEAL -64(DI), DI
	LEAL -4(R11), R11
	MOVW DI, 1(CX)
	CMPL R11, $0x3c
	JBE emit_copy2_0_match_nolit_encodeBetterBlockAsm4K_emit2
	LEAL -60(R11), SI
	CMPL R11, $0x0000013c
	JB emit_copy2_1_match_nolit_encodeBetterBlockAsm4K_emit2
	CMPL R11, $0x0001003c
	JB emit_copy2_2_match_nolit_encodeBetterBlockAsm4K_emit2
	MOVB $0xfe, (CX)
	MOVL SI, 3(CX)
	ADDQ $0x06, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4K

emit_copy2_2_match_nolit_encodeBetterBlockAsm4K_emit2:
	MOVB $0xfa, (CX)
	MOVW SI, 3(CX)
	ADDQ $0x05, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4K

emit_copy2_1_match_nolit_encodeBetterBlockAsm4K_emit2:
	MOVB $0xf6, (CX)
	MOVB SI, 3(CX)
	ADDQ $0x04, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4K

emit_copy2_0_match_nolit_encodeBetterBlockAsm4K_emit2:
	MOVL $0x00000002, SI
	LEAL (SI)(R11*4), SI
	MOVB SI, (CX)
	ADDQ $0x03, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4K

	// NOTE(review): the code from here to match_nolit_emitcopy_end follows
	// an unconditional JMP and is only branched to from within itself in
	// this function; it appears to be unreachable generator output. Confirm
	// against the generator before removing.

	// emitLiteralsDstP
	MOVL 12(SP), SI
	CMPL SI, BX
	JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4K
	MOVL BX, DI
	MOVL BX, 12(SP)
	LEAQ (DX)(SI*1), R8
	SUBL SI, DI

	// emitLiteral
	LEAL -1(DI), SI
	CMPL SI, $0x1d
	JB one_byte_match_emit_repeat_encodeBetterBlockAsm4K
	SUBL $0x1d, SI
	CMPL SI, $0x00000100
	JB two_bytes_match_emit_repeat_encodeBetterBlockAsm4K
	JB three_bytes_match_emit_repeat_encodeBetterBlockAsm4K

three_bytes_match_emit_repeat_encodeBetterBlockAsm4K:
	MOVB $0xf0, (CX)
	MOVW SI, 1(CX)
	ADDQ $0x03, CX
	ADDL $0x1d, SI
	JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4K

two_bytes_match_emit_repeat_encodeBetterBlockAsm4K:
	MOVB $0xe8, (CX)
	MOVB SI, 1(CX)
	ADDL $0x1d, SI
	ADDQ $0x02, CX
	CMPL SI, $0x40
	JB memmove_midmatch_emit_repeat_encodeBetterBlockAsm4K
	JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4K

one_byte_match_emit_repeat_encodeBetterBlockAsm4K:
	SHLB $0x03, SI
	MOVB SI, (CX)
	ADDQ $0x01, CX
	LEAQ (CX)(DI*1), SI

	// genMemMoveShort
	// margin: 8, min move: 1
	CMPQ DI, $0x08
	JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4K_memmove_move_8
	CMPQ DI, $0x10
	JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4K_memmove_move_8through16
	CMPQ DI, $0x20
	JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4K_memmove_move_17through32
	JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4K_memmove_move_33through64

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4K_memmove_move_8:
	MOVQ (R8), R9
	MOVQ R9, (CX)
	JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4K

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4K_memmove_move_8through16:
	MOVQ (R8), R9
	MOVQ -8(R8)(DI*1), R8
	MOVQ R9, (CX)
	MOVQ R8, -8(CX)(DI*1)
	JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4K

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4K_memmove_move_17through32:
	MOVOU (R8), X0
	MOVOU -16(R8)(DI*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(DI*1)
	JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4K

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4K_memmove_move_33through64:
	MOVOU (R8), X0
	MOVOU 16(R8), X1
	MOVOU -32(R8)(DI*1), X2
	MOVOU -16(R8)(DI*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(DI*1)
	MOVOU X3, -16(CX)(DI*1)

memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4K:
	MOVQ SI, CX
	JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4K

memmove_midmatch_emit_repeat_encodeBetterBlockAsm4K:
	LEAQ (CX)(DI*1), SI

	// genMemMoveShort
	// margin: 8, min move: 30
	CMPQ DI, $0x20
	JBE emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm4K_memmove_move_17through32
	JMP emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm4K_memmove_move_33through64

emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm4K_memmove_move_17through32:
	MOVOU (R8), X0
	MOVOU -16(R8)(DI*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(DI*1)
	JMP memmove_mid_end_copy_match_emit_repeat_encodeBetterBlockAsm4K

emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm4K_memmove_move_33through64:
	MOVOU (R8), X0
	MOVOU 16(R8), X1
	MOVOU -32(R8)(DI*1), X2
	MOVOU -16(R8)(DI*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(DI*1)
	MOVOU X3, -16(CX)(DI*1)

memmove_mid_end_copy_match_emit_repeat_encodeBetterBlockAsm4K:
	MOVQ SI, CX
	JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4K

memmove_long_match_emit_repeat_encodeBetterBlockAsm4K:
	LEAQ (CX)(DI*1), SI

	// genMemMoveLong
	MOVOU (R8), X0
	MOVOU 16(R8), X1
	MOVOU -32(R8)(DI*1), X2
	MOVOU -16(R8)(DI*1), X3
	MOVQ DI, R10
	SHRQ $0x05, R10
	MOVQ CX, R9
	ANDL $0x0000001f, R9
	MOVQ $0x00000040, R12
	SUBQ R9, R12
	DECQ R10
	JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4Klarge_forward_sse_loop_32
	LEAQ -32(R8)(R12*1), R9
	LEAQ -32(CX)(R12*1), R13

emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4Klarge_big_loop_back:
	MOVOU (R9), X4
	MOVOU 16(R9), X5
	MOVOA X4, (R13)
	MOVOA X5, 16(R13)
	ADDQ $0x20, R13
	ADDQ $0x20, R9
	ADDQ $0x20, R12
	DECQ R10
	JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4Klarge_big_loop_back

emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4Klarge_forward_sse_loop_32:
	MOVOU -32(R8)(R12*1), X4
	MOVOU -16(R8)(R12*1), X5
	MOVOA X4, -32(CX)(R12*1)
	MOVOA X5, -16(CX)(R12*1)
	ADDQ $0x20, R12
	CMPQ DI, R12
	JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4Klarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(DI*1)
	MOVOU X3, -16(CX)(DI*1)
	MOVQ SI, CX

emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4K:
	ADDL R11, AX
	ADDL $0x04, R11
	MOVL AX, 12(SP)

	// emitRepeat
	LEAL -1(R11), SI
	CMPL R11, $0x1d
	JBE repeat_one_match_nolit_repeat_encodeBetterBlockAsm4K
	LEAL -30(R11), SI
	CMPL R11, $0x0000011e
	JB repeat_two_match_nolit_repeat_encodeBetterBlockAsm4K
	CMPL R11, $0x0001001e
	JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm4K
	MOVB $0xfc, (CX)
	MOVL SI, 1(CX)
	ADDQ $0x04, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4K

repeat_three_match_nolit_repeat_encodeBetterBlockAsm4K:
	MOVB $0xf4, (CX)
	MOVW SI, 1(CX)
	ADDQ $0x03, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4K

repeat_two_match_nolit_repeat_encodeBetterBlockAsm4K:
	MOVB $0xec, (CX)
	MOVB SI, 1(CX)
	ADDQ $0x02, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4K

repeat_one_match_nolit_repeat_encodeBetterBlockAsm4K:
	XORL SI, SI
	LEAL -4(SI)(R11*8), SI
	MOVB SI, (CX)
	ADDQ $0x01, CX

match_nolit_emitcopy_end_encodeBetterBlockAsm4K:
	// Stop searching once AX reaches the src limit; return 0 if the dst
	// pointer has reached its limit.
	CMPL AX, 8(SP)
	JAE emit_remainder_encodeBetterBlockAsm4K
	CMPQ CX, (SP)
	JB match_nolit_dst_ok_encodeBetterBlockAsm4K
	MOVQ $0x00000000, ret+56(FP)
	RET

match_nolit_dst_ok_encodeBetterBlockAsm4K:
	// Re-index around the match just emitted. BX = match start + 1 and
	// R9 = AX - 2 go into the long table; the positions one past each
	// (R8, R14) go into the short table.
	MOVQ tmp+48(FP), SI
	MOVQ $0x0000cf1bbcdcbf9b, DI
	MOVQ $0x9e3779b1, R8
	LEAQ 1(BX), BX
	LEAQ -2(AX), R9
	MOVQ (DX)(BX*1), R10
	MOVQ 1(DX)(BX*1), R11
	MOVQ (DX)(R9*1), R12
	MOVQ 1(DX)(R9*1), R13
	SHLQ $0x10, R10
	IMULQ DI, R10
	SHRQ $0x34, R10
	SHLQ $0x20, R11
	IMULQ R8, R11
	SHRQ $0x36, R11
	SHLQ $0x10, R12
	IMULQ DI, R12
	SHRQ $0x34, R12
	SHLQ $0x20, R13
	IMULQ R8, R13
	SHRQ $0x36, R13
	LEAQ 1(BX), R8
	LEAQ 1(R9), R14
	MOVW BX, (SI)(R10*2)
	MOVW R9, (SI)(R12*2)

	// R10 = midpoint (BX + R9 + 1) >> 1; then BX moves forward by one and
	// R9 back by one before the sparse indexing loop.
	LEAQ 1(R9)(BX*1), R10
	SHRQ $0x01, R10
	ADDQ $0x01, BX
	SUBQ $0x01, R9
	MOVW R8, 8192(SI)(R11*2)
	MOVW R14, 8192(SI)(R13*2)

	// Index every second position in the long table from two starting
	// points (BX and R10) until R10 reaches R9, then resume searching.
index_loop_encodeBetterBlockAsm4K:
	CMPQ R10, R9
	JAE search_loop_encodeBetterBlockAsm4K
	MOVQ (DX)(BX*1), R8
	MOVQ (DX)(R10*1), R11
	SHLQ $0x10, R8
	IMULQ DI, R8
	SHRQ $0x34, R8
	SHLQ $0x10, R11
	IMULQ DI, R11
	SHRQ $0x34, R11
	MOVW BX, (SI)(R8*2)

	// NOTE(review): R11 is the hash of the bytes at position R10, but the
	// value stored is R9. Confirm against the generator whether R10 was
	// intended.
	MOVW R9, (SI)(R11*2)
	ADDQ $0x02, BX
	ADDQ $0x02, R10
	JMP index_loop_encodeBetterBlockAsm4K

emit_remainder_encodeBetterBlockAsm4K:
	// Return 0 unless dst + 3 + (src_len - 12(SP)) stays below the limit.
	MOVQ src_len+32(FP), AX
	SUBL 12(SP), AX
	LEAQ 3(CX)(AX*1), AX
	CMPQ AX, (SP)
	JB emit_remainder_ok_encodeBetterBlockAsm4K
	MOVQ $0x00000000, ret+56(FP)
	RET

emit_remainder_ok_encodeBetterBlockAsm4K:
	MOVQ src_len+32(FP), AX

	// emitLiteralsDstP
	// Emit src[12(SP):src_len] as the final literals. From here AX becomes
	// the source pointer, SI/BX the count, and DX is reused as scratch.
	MOVL 12(SP), BX
	CMPL BX, AX
	JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm4K
	MOVL AX, SI
	MOVL AX, 12(SP)
	LEAQ (DX)(BX*1), AX
	SUBL BX, SI

	// emitLiteral
	LEAL -1(SI), DX
	CMPL DX, $0x1d
	JB one_byte_emit_remainder_encodeBetterBlockAsm4K
	SUBL $0x1d, DX
	CMPL DX, $0x00000100
	JB two_bytes_emit_remainder_encodeBetterBlockAsm4K

	// Never taken (same flags as the JB above); falls through to the label.
	JB three_bytes_emit_remainder_encodeBetterBlockAsm4K

three_bytes_emit_remainder_encodeBetterBlockAsm4K:
	MOVB $0xf0, (CX)
	MOVW DX, 1(CX)
	ADDQ $0x03, CX
	ADDL $0x1d, DX
	JMP memmove_long_emit_remainder_encodeBetterBlockAsm4K

two_bytes_emit_remainder_encodeBetterBlockAsm4K:
	MOVB $0xe8, (CX)
	MOVB DL, 1(CX)
	ADDL $0x1d, DX
	ADDQ $0x02, CX
	CMPL DX, $0x40
	JB memmove_midemit_remainder_encodeBetterBlockAsm4K
	JMP memmove_long_emit_remainder_encodeBetterBlockAsm4K

one_byte_emit_remainder_encodeBetterBlockAsm4K:
	SHLB $0x03, DL
	MOVB DL, (CX)
	ADDQ $0x01, CX
	LEAQ (CX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// margin: -1, min move: 1
	// Unlike the copies above, every size class here moves exactly BX bytes.
	CMPQ BX, $0x03
	JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4K_memmove_move_1or2
	JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4K_memmove_move_3
	CMPQ BX, $0x08
	JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4K_memmove_move_4through8
	CMPQ BX, $0x10
	JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4K_memmove_move_8through16
	CMPQ BX, $0x20
	JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4K_memmove_move_17through32
	JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4K_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4K_memmove_move_1or2:
	MOVB (AX), SI
	MOVB -1(AX)(BX*1), AL
	MOVB SI, (CX)
	MOVB AL, -1(CX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4K

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4K_memmove_move_3:
	MOVW (AX), SI
	MOVB 2(AX), AL
	MOVW SI, (CX)
	MOVB AL, 2(CX)
	JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4K

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4K_memmove_move_4through8:
	MOVL (AX), SI
	MOVL -4(AX)(BX*1), AX
	MOVL SI, (CX)
	MOVL AX, -4(CX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4K

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4K_memmove_move_8through16:
	MOVQ (AX), SI
	MOVQ -8(AX)(BX*1), AX
	MOVQ SI, (CX)
	MOVQ AX, -8(CX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4K

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4K_memmove_move_17through32:
	MOVOU (AX), X0
	MOVOU -16(AX)(BX*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4K

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4K_memmove_move_33through64:
	MOVOU (AX), X0
	MOVOU 16(AX), X1
	MOVOU -32(AX)(BX*1), X2
	MOVOU -16(AX)(BX*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)

memmove_end_copy_emit_remainder_encodeBetterBlockAsm4K:
	MOVQ DX, CX
	JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm4K

memmove_midemit_remainder_encodeBetterBlockAsm4K:
	LEAQ (CX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// margin: -2, min move: 30
	CMPQ BX, $0x20
	JBE emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm4K_memmove_move_17through32
	JMP emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm4K_memmove_move_33through64

emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm4K_memmove_move_17through32:
	MOVOU (AX), X0
	MOVOU -16(AX)(BX*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(BX*1)
	JMP memmove_mid_end_copy_emit_remainder_encodeBetterBlockAsm4K

emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm4K_memmove_move_33through64:
	MOVOU (AX), X0
	MOVOU 16(AX), X1
	MOVOU -32(AX)(BX*1), X2
	MOVOU -16(AX)(BX*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)

memmove_mid_end_copy_emit_remainder_encodeBetterBlockAsm4K:
	MOVQ DX, CX
	JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm4K

memmove_long_emit_remainder_encodeBetterBlockAsm4K:
	LEAQ (CX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	MOVOU (AX), X0
	MOVOU 16(AX), X1
	MOVOU -32(AX)(BX*1), X2
	MOVOU -16(AX)(BX*1), X3
	MOVQ BX, DI
	SHRQ $0x05, DI
	MOVQ CX, SI
	ANDL $0x0000001f, SI
	MOVQ $0x00000040, R8
	SUBQ SI, R8
	DECQ DI
	JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4Klarge_forward_sse_loop_32
	LEAQ -32(AX)(R8*1), SI
	LEAQ -32(CX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4Klarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ $0x20, R9
	ADDQ $0x20, SI
	ADDQ $0x20, R8
	DECQ DI
	JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4Klarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4Klarge_forward_sse_loop_32:
	MOVOU -32(AX)(R8*1), X4
	MOVOU -16(AX)(R8*1), X5
	MOVOA X4, -32(CX)(R8*1)
	MOVOA X5, -16(CX)(R8*1)
	ADDQ $0x20, R8
	CMPQ BX, R8
	JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4Klarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)
	MOVQ DX, CX

emit_literal_done_emit_remainder_encodeBetterBlockAsm4K:
	// Return value: bytes written = CX - dst_base.
	MOVQ dst_base+0(FP), AX
	SUBQ AX, CX
	MOVQ CX, ret+56(FP)
	RET

// func encodeBetterBlockAsm1K(dst []byte, src []byte, tmp *[4608]byte) int
// Requires: BMI, CMOV, SSE2
TEXT ·encodeBetterBlockAsm1K(SB), $24-64 MOVQ tmp+48(FP), AX MOVQ dst_base+0(FP), CX MOVQ $0x00000024, DX PXOR X0, X0 zero_loop_encodeBetterBlockAsm1K: MOVOU X0, (AX) MOVOU X0, 16(AX) MOVOU X0, 32(AX) MOVOU X0, 48(AX) MOVOU X0, 64(AX) MOVOU X0, 80(AX) MOVOU X0, 96(AX) MOVOU X0, 112(AX) ADDQ $0x80, AX DECQ DX JNZ zero_loop_encodeBetterBlockAsm1K MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), AX LEAQ -11(AX), DX LEAQ -8(AX), BX MOVL BX, 8(SP) SHRQ $0x05, AX SUBL AX, DX LEAQ (CX)(DX*1), DX MOVQ DX, (SP) MOVL $0x00000001, AX MOVL AX, 16(SP) MOVQ src_base+24(FP), DX search_loop_encodeBetterBlockAsm1K: MOVQ tmp+48(FP), BX MOVL AX, SI SUBL 12(SP), SI SHRL $0x04, SI LEAL 1(AX)(SI*1), SI CMPL SI, 8(SP) JAE emit_remainder_encodeBetterBlockAsm1K MOVQ (DX)(AX*1), DI MOVL SI, 20(SP) MOVQ $0x0000cf1bbcdcbf9b, R9 MOVQ $0x9e3779b1, SI MOVQ DI, R10 MOVQ DI, R11 SHLQ $0x10, R10 IMULQ R9, R10 SHRQ $0x35, R10 SHLQ $0x20, R11 IMULQ SI, R11 SHRQ $0x38, R11 MOVWLZX (BX)(R10*2), SI MOVWLZX 4096(BX)(R11*2), R8 MOVW AX, (BX)(R10*2) MOVW AX, 4096(BX)(R11*2) MOVQ (DX)(SI*1), R10 CMPQ R10, DI JEQ candidate_match_encodeBetterBlockAsm1K MOVQ (DX)(R8*1), R11 CMPQ R11, DI MOVL AX, R12 SUBL 16(SP), R12 MOVQ (DX)(R12*1), R12 MOVQ $0x000000ffffffff00, R13 XORQ DI, R12 TESTQ R13, R12 JNE no_repeat_found_encodeBetterBlockAsm1K LEAL 1(AX), BX MOVL 12(SP), SI MOVL BX, DI SUBL 16(SP), DI JZ repeat_extend_back_end_encodeBetterBlockAsm1K repeat_extend_back_loop_encodeBetterBlockAsm1K: CMPL BX, SI JBE repeat_extend_back_end_encodeBetterBlockAsm1K MOVB -1(DX)(DI*1), R8 MOVB -1(DX)(BX*1), R9 CMPB R8, R9 JNE repeat_extend_back_end_encodeBetterBlockAsm1K LEAL -1(BX), BX DECL DI JNZ repeat_extend_back_loop_encodeBetterBlockAsm1K repeat_extend_back_end_encodeBetterBlockAsm1K: MOVL BX, SI SUBL 12(SP), SI LEAQ 3(CX)(SI*1), SI CMPQ SI, (SP) JB repeat_dst_size_check_encodeBetterBlockAsm1K MOVQ $0x00000000, ret+56(FP) RET repeat_dst_size_check_encodeBetterBlockAsm1K: // emitLiteralsDstP MOVL 12(SP), SI CMPL SI, BX JEQ 
emit_literal_done_repeat_emit_encodeBetterBlockAsm1K MOVL BX, DI MOVL BX, 12(SP) LEAQ (DX)(SI*1), R8 SUBL SI, DI // emitLiteral LEAL -1(DI), SI CMPL SI, $0x1d JB one_byte_repeat_emit_encodeBetterBlockAsm1K SUBL $0x1d, SI CMPL SI, $0x00000100 JB two_bytes_repeat_emit_encodeBetterBlockAsm1K JB three_bytes_repeat_emit_encodeBetterBlockAsm1K three_bytes_repeat_emit_encodeBetterBlockAsm1K: MOVB $0xf0, (CX) MOVW SI, 1(CX) ADDQ $0x03, CX ADDL $0x1d, SI JMP memmove_long_repeat_emit_encodeBetterBlockAsm1K two_bytes_repeat_emit_encodeBetterBlockAsm1K: MOVB $0xe8, (CX) MOVB SI, 1(CX) ADDL $0x1d, SI ADDQ $0x02, CX CMPL SI, $0x40 JB memmove_midrepeat_emit_encodeBetterBlockAsm1K JMP memmove_long_repeat_emit_encodeBetterBlockAsm1K one_byte_repeat_emit_encodeBetterBlockAsm1K: SHLB $0x03, SI MOVB SI, (CX) ADDQ $0x01, CX LEAQ (CX)(DI*1), SI // genMemMoveShort // margin: 8, min move: 1 CMPQ DI, $0x08 JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm1K_memmove_move_8 CMPQ DI, $0x10 JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm1K_memmove_move_8through16 CMPQ DI, $0x20 JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm1K_memmove_move_17through32 JMP emit_lit_memmove_repeat_emit_encodeBetterBlockAsm1K_memmove_move_33through64 emit_lit_memmove_repeat_emit_encodeBetterBlockAsm1K_memmove_move_8: MOVQ (R8), R9 MOVQ R9, (CX) JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm1K emit_lit_memmove_repeat_emit_encodeBetterBlockAsm1K_memmove_move_8through16: MOVQ (R8), R9 MOVQ -8(R8)(DI*1), R8 MOVQ R9, (CX) MOVQ R8, -8(CX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm1K emit_lit_memmove_repeat_emit_encodeBetterBlockAsm1K_memmove_move_17through32: MOVOU (R8), X0 MOVOU -16(R8)(DI*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm1K emit_lit_memmove_repeat_emit_encodeBetterBlockAsm1K_memmove_move_33through64: MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(DI*1), X2 MOVOU -16(R8)(DI*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) 
MOVOU X2, -32(CX)(DI*1) MOVOU X3, -16(CX)(DI*1) memmove_end_copy_repeat_emit_encodeBetterBlockAsm1K: MOVQ SI, CX JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm1K memmove_midrepeat_emit_encodeBetterBlockAsm1K: LEAQ (CX)(DI*1), SI // genMemMoveShort // margin: 8, min move: 30 CMPQ DI, $0x20 JBE emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm1K_memmove_move_17through32 JMP emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm1K_memmove_move_33through64 emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm1K_memmove_move_17through32: MOVOU (R8), X0 MOVOU -16(R8)(DI*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(DI*1) JMP memmove_mid_end_copy_repeat_emit_encodeBetterBlockAsm1K emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm1K_memmove_move_33through64: MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(DI*1), X2 MOVOU -16(R8)(DI*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(DI*1) MOVOU X3, -16(CX)(DI*1) memmove_mid_end_copy_repeat_emit_encodeBetterBlockAsm1K: MOVQ SI, CX JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm1K memmove_long_repeat_emit_encodeBetterBlockAsm1K: LEAQ (CX)(DI*1), SI // genMemMoveLong MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(DI*1), X2 MOVOU -16(R8)(DI*1), X3 MOVQ DI, R10 SHRQ $0x05, R10 MOVQ CX, R9 ANDL $0x0000001f, R9 MOVQ $0x00000040, R11 SUBQ R9, R11 DECQ R10 JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm1Klarge_forward_sse_loop_32 LEAQ -32(R8)(R11*1), R9 LEAQ -32(CX)(R11*1), R12 emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm1Klarge_big_loop_back: MOVOU (R9), X4 MOVOU 16(R9), X5 MOVOA X4, (R12) MOVOA X5, 16(R12) ADDQ $0x20, R12 ADDQ $0x20, R9 ADDQ $0x20, R11 DECQ R10 JNA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm1Klarge_big_loop_back emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm1Klarge_forward_sse_loop_32: MOVOU -32(R8)(R11*1), X4 MOVOU -16(R8)(R11*1), X5 MOVOA X4, -32(CX)(R11*1) MOVOA X5, -16(CX)(R11*1) ADDQ $0x20, R11 CMPQ DI, R11 JAE 
emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm1Klarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(DI*1) MOVOU X3, -16(CX)(DI*1) MOVQ SI, CX emit_literal_done_repeat_emit_encodeBetterBlockAsm1K: ADDL $0x05, AX MOVL AX, SI SUBL 16(SP), SI MOVQ src_len+32(FP), DI SUBL AX, DI LEAQ (DX)(AX*1), R8 LEAQ (DX)(SI*1), SI // matchLen XORL R10, R10 JMP matchlen_loop_16_entry_repeat_extend_encodeBetterBlockAsm1K matchlen_loopback_16_repeat_extend_encodeBetterBlockAsm1K: MOVQ (R8)(R10*1), R9 MOVQ 8(R8)(R10*1), R11 XORQ (SI)(R10*1), R9 JNZ matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm1K XORQ 8(SI)(R10*1), R11 JNZ matchlen_bsf_16repeat_extend_encodeBetterBlockAsm1K LEAL -16(DI), DI LEAL 16(R10), R10 matchlen_loop_16_entry_repeat_extend_encodeBetterBlockAsm1K: CMPL DI, $0x10 JAE matchlen_loopback_16_repeat_extend_encodeBetterBlockAsm1K JMP matchlen_match8_repeat_extend_encodeBetterBlockAsm1K matchlen_bsf_16repeat_extend_encodeBetterBlockAsm1K: #ifdef GOAMD64_v3 TZCNTQ R11, R11 #else BSFQ R11, R11 #endif SARQ $0x03, R11 LEAL 8(R10)(R11*1), R10 JMP repeat_extend_forward_end_encodeBetterBlockAsm1K matchlen_match8_repeat_extend_encodeBetterBlockAsm1K: CMPL DI, $0x08 JB matchlen_match4_repeat_extend_encodeBetterBlockAsm1K MOVQ (R8)(R10*1), R9 XORQ (SI)(R10*1), R9 JNZ matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm1K LEAL -8(DI), DI LEAL 8(R10), R10 JMP matchlen_match4_repeat_extend_encodeBetterBlockAsm1K matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm1K: #ifdef GOAMD64_v3 TZCNTQ R9, R9 #else BSFQ R9, R9 #endif SARQ $0x03, R9 LEAL (R10)(R9*1), R10 JMP repeat_extend_forward_end_encodeBetterBlockAsm1K matchlen_match4_repeat_extend_encodeBetterBlockAsm1K: CMPL DI, $0x04 JB matchlen_match2_repeat_extend_encodeBetterBlockAsm1K MOVL (R8)(R10*1), R9 CMPL (SI)(R10*1), R9 JNE matchlen_match2_repeat_extend_encodeBetterBlockAsm1K LEAL -4(DI), DI LEAL 4(R10), R10 matchlen_match2_repeat_extend_encodeBetterBlockAsm1K: CMPL DI, $0x01 JE 
matchlen_match1_repeat_extend_encodeBetterBlockAsm1K JB repeat_extend_forward_end_encodeBetterBlockAsm1K MOVW (R8)(R10*1), R9 CMPW (SI)(R10*1), R9 JNE matchlen_match1_repeat_extend_encodeBetterBlockAsm1K LEAL 2(R10), R10 SUBL $0x02, DI JZ repeat_extend_forward_end_encodeBetterBlockAsm1K matchlen_match1_repeat_extend_encodeBetterBlockAsm1K: MOVB (R8)(R10*1), R9 CMPB (SI)(R10*1), R9 JNE repeat_extend_forward_end_encodeBetterBlockAsm1K LEAL 1(R10), R10 repeat_extend_forward_end_encodeBetterBlockAsm1K: ADDL R10, AX MOVL AX, SI SUBL BX, SI MOVL 16(SP), BX // emitRepeat LEAL -1(SI), BX CMPL SI, $0x1d JBE repeat_one_match_repeat_encodeBetterBlockAsm1K LEAL -30(SI), BX CMPL SI, $0x0000011e JB repeat_two_match_repeat_encodeBetterBlockAsm1K CMPL SI, $0x0001001e JB repeat_three_match_repeat_encodeBetterBlockAsm1K MOVB $0xfc, (CX) MOVL BX, 1(CX) ADDQ $0x04, CX JMP repeat_end_emit_encodeBetterBlockAsm1K repeat_three_match_repeat_encodeBetterBlockAsm1K: MOVB $0xf4, (CX) MOVW BX, 1(CX) ADDQ $0x03, CX JMP repeat_end_emit_encodeBetterBlockAsm1K repeat_two_match_repeat_encodeBetterBlockAsm1K: MOVB $0xec, (CX) MOVB BL, 1(CX) ADDQ $0x02, CX JMP repeat_end_emit_encodeBetterBlockAsm1K repeat_one_match_repeat_encodeBetterBlockAsm1K: XORL BX, BX LEAL -4(BX)(SI*8), BX MOVB BL, (CX) ADDQ $0x01, CX repeat_end_emit_encodeBetterBlockAsm1K: MOVL AX, 12(SP) JMP search_loop_encodeBetterBlockAsm1K no_repeat_found_encodeBetterBlockAsm1K: CMPL R10, DI JEQ candidate_match_encodeBetterBlockAsm1K CMPL R11, DI JEQ candidateS_match_encodeBetterBlockAsm1K MOVL 20(SP), AX JMP search_loop_encodeBetterBlockAsm1K candidateS_match_encodeBetterBlockAsm1K: SHRQ $0x08, DI MOVQ DI, R10 SHLQ $0x10, R10 IMULQ R9, R10 SHRQ $0x35, R10 MOVWLZX (BX)(R10*2), SI INCL AX MOVW AX, (BX)(R10*2) CMPL (DX)(SI*1), DI JEQ candidate_match_encodeBetterBlockAsm1K DECL AX MOVL R8, SI candidate_match_encodeBetterBlockAsm1K: MOVL 12(SP), BX TESTL SI, SI JZ match_extend_back_end_encodeBetterBlockAsm1K 
match_extend_back_loop_encodeBetterBlockAsm1K: CMPL AX, BX JBE match_extend_back_end_encodeBetterBlockAsm1K MOVB -1(DX)(SI*1), DI MOVB -1(DX)(AX*1), R8 CMPB DI, R8 JNE match_extend_back_end_encodeBetterBlockAsm1K LEAL -1(AX), AX DECL SI JZ match_extend_back_end_encodeBetterBlockAsm1K JMP match_extend_back_loop_encodeBetterBlockAsm1K match_extend_back_end_encodeBetterBlockAsm1K: MOVL AX, BX SUBL 12(SP), BX LEAQ 3(CX)(BX*1), BX CMPQ BX, (SP) JB match_dst_size_check_encodeBetterBlockAsm1K MOVQ $0x00000000, ret+56(FP) RET match_dst_size_check_encodeBetterBlockAsm1K: MOVL AX, BX ADDL $0x04, AX ADDL $0x04, SI MOVQ src_len+32(FP), DI SUBL AX, DI LEAQ (DX)(AX*1), R8 LEAQ (DX)(SI*1), R9 // matchLen XORL R11, R11 JMP matchlen_loop_16_entry_match_nolit_encodeBetterBlockAsm1K matchlen_loopback_16_match_nolit_encodeBetterBlockAsm1K: MOVQ (R8)(R11*1), R10 MOVQ 8(R8)(R11*1), R12 XORQ (R9)(R11*1), R10 JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm1K XORQ 8(R9)(R11*1), R12 JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm1K LEAL -16(DI), DI LEAL 16(R11), R11 matchlen_loop_16_entry_match_nolit_encodeBetterBlockAsm1K: CMPL DI, $0x10 JAE matchlen_loopback_16_match_nolit_encodeBetterBlockAsm1K JMP matchlen_match8_match_nolit_encodeBetterBlockAsm1K matchlen_bsf_16match_nolit_encodeBetterBlockAsm1K: #ifdef GOAMD64_v3 TZCNTQ R12, R12 #else BSFQ R12, R12 #endif SARQ $0x03, R12 LEAL 8(R11)(R12*1), R11 JMP match_nolit_end_encodeBetterBlockAsm1K matchlen_match8_match_nolit_encodeBetterBlockAsm1K: CMPL DI, $0x08 JB matchlen_match4_match_nolit_encodeBetterBlockAsm1K MOVQ (R8)(R11*1), R10 XORQ (R9)(R11*1), R10 JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm1K LEAL -8(DI), DI LEAL 8(R11), R11 JMP matchlen_match4_match_nolit_encodeBetterBlockAsm1K matchlen_bsf_8_match_nolit_encodeBetterBlockAsm1K: #ifdef GOAMD64_v3 TZCNTQ R10, R10 #else BSFQ R10, R10 #endif SARQ $0x03, R10 LEAL (R11)(R10*1), R11 JMP match_nolit_end_encodeBetterBlockAsm1K 
matchlen_match4_match_nolit_encodeBetterBlockAsm1K: CMPL DI, $0x04 JB matchlen_match2_match_nolit_encodeBetterBlockAsm1K MOVL (R8)(R11*1), R10 CMPL (R9)(R11*1), R10 JNE matchlen_match2_match_nolit_encodeBetterBlockAsm1K LEAL -4(DI), DI LEAL 4(R11), R11 matchlen_match2_match_nolit_encodeBetterBlockAsm1K: CMPL DI, $0x01 JE matchlen_match1_match_nolit_encodeBetterBlockAsm1K JB match_nolit_end_encodeBetterBlockAsm1K MOVW (R8)(R11*1), R10 CMPW (R9)(R11*1), R10 JNE matchlen_match1_match_nolit_encodeBetterBlockAsm1K LEAL 2(R11), R11 SUBL $0x02, DI JZ match_nolit_end_encodeBetterBlockAsm1K matchlen_match1_match_nolit_encodeBetterBlockAsm1K: MOVB (R8)(R11*1), R10 CMPB (R9)(R11*1), R10 JNE match_nolit_end_encodeBetterBlockAsm1K LEAL 1(R11), R11 match_nolit_end_encodeBetterBlockAsm1K: MOVL AX, DI SUBL SI, DI MOVL DI, 16(SP) // Check if we can combine lit+copy MOVLQZX 12(SP), R8 MOVL BX, SI SUBL R8, SI JZ match_emit_nolits_encodeBetterBlockAsm1K CMPL DI, $0x00000040 JL match_emit_lits_encodeBetterBlockAsm1K CMPL SI, $0x04 JA match_emit_lits_encodeBetterBlockAsm1K MOVL (DX)(R8*1), R8 ADDL R11, AX ADDL $0x04, R11 MOVL AX, 12(SP) // emitCopy2WithLits XORQ R9, R9 SUBL $0x40, DI LEAL -11(R11), R10 LEAL -4(R11), R11 MOVW DI, 1(CX) CMPL R11, $0x07 CMOVLGE R10, R9 MOVQ $0x00000007, DI CMOVLLT R11, DI LEAL -1(SI)(DI*4), DI MOVL $0x00000003, R10 LEAL (R10)(DI*8), DI MOVB DI, (CX) ADDQ $0x03, CX MOVL R8, (CX) ADDQ SI, CX TESTL R9, R9 JZ match_nolit_emitcopy_end_encodeBetterBlockAsm1K // emitRepeat LEAL -1(R9), SI CMPL R9, $0x1d JBE repeat_one_match_emit_repeat_copy2_encodeBetterBlockAsm1K LEAL -30(R9), SI CMPL R9, $0x0000011e JB repeat_two_match_emit_repeat_copy2_encodeBetterBlockAsm1K CMPL R9, $0x0001001e JB repeat_three_match_emit_repeat_copy2_encodeBetterBlockAsm1K MOVB $0xfc, (CX) MOVL SI, 1(CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm1K repeat_three_match_emit_repeat_copy2_encodeBetterBlockAsm1K: MOVB $0xf4, (CX) MOVW SI, 1(CX) ADDQ $0x03, CX JMP 
match_nolit_emitcopy_end_encodeBetterBlockAsm1K repeat_two_match_emit_repeat_copy2_encodeBetterBlockAsm1K: MOVB $0xec, (CX) MOVB SI, 1(CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm1K repeat_one_match_emit_repeat_copy2_encodeBetterBlockAsm1K: XORL SI, SI LEAL -4(SI)(R9*8), SI MOVB SI, (CX) ADDQ $0x01, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm1K match_emit_lits_encodeBetterBlockAsm1K: LEAQ (DX)(R8*1), R8 // emitLiteral LEAL -1(SI), R9 CMPL R9, $0x1d JB one_byte_match_emit_encodeBetterBlockAsm1K SUBL $0x1d, R9 CMPL R9, $0x00000100 JB two_bytes_match_emit_encodeBetterBlockAsm1K JB three_bytes_match_emit_encodeBetterBlockAsm1K three_bytes_match_emit_encodeBetterBlockAsm1K: MOVB $0xf0, (CX) MOVW R9, 1(CX) ADDQ $0x03, CX ADDL $0x1d, R9 JMP memmove_long_match_emit_encodeBetterBlockAsm1K two_bytes_match_emit_encodeBetterBlockAsm1K: MOVB $0xe8, (CX) MOVB R9, 1(CX) ADDL $0x1d, R9 ADDQ $0x02, CX CMPL R9, $0x40 JB memmove_midmatch_emit_encodeBetterBlockAsm1K JMP memmove_long_match_emit_encodeBetterBlockAsm1K one_byte_match_emit_encodeBetterBlockAsm1K: SHLB $0x03, R9 MOVB R9, (CX) ADDQ $0x01, CX LEAQ (CX)(SI*1), R9 // genMemMoveShort // margin: 8, min move: 1 CMPQ SI, $0x08 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm1K_memmove_move_8 CMPQ SI, $0x10 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm1K_memmove_move_8through16 CMPQ SI, $0x20 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm1K_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm1K_memmove_move_33through64 emit_lit_memmove_match_emit_encodeBetterBlockAsm1K_memmove_move_8: MOVQ (R8), R10 MOVQ R10, (CX) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm1K emit_lit_memmove_match_emit_encodeBetterBlockAsm1K_memmove_move_8through16: MOVQ (R8), R10 MOVQ -8(R8)(SI*1), R8 MOVQ R10, (CX) MOVQ R8, -8(CX)(SI*1) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm1K emit_lit_memmove_match_emit_encodeBetterBlockAsm1K_memmove_move_17through32: MOVOU (R8), 
X0 MOVOU -16(R8)(SI*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(SI*1) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm1K emit_lit_memmove_match_emit_encodeBetterBlockAsm1K_memmove_move_33through64: MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(SI*1), X2 MOVOU -16(R8)(SI*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(SI*1) MOVOU X3, -16(CX)(SI*1) memmove_end_copy_match_emit_encodeBetterBlockAsm1K: MOVQ R9, CX JMP match_emit_nolits_encodeBetterBlockAsm1K memmove_midmatch_emit_encodeBetterBlockAsm1K: LEAQ (CX)(SI*1), R9 // genMemMoveShort // margin: 8, min move: 30 CMPQ SI, $0x20 JBE emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm1K_memmove_move_17through32 JMP emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm1K_memmove_move_33through64 emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm1K_memmove_move_17through32: MOVOU (R8), X0 MOVOU -16(R8)(SI*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(SI*1) JMP memmove_mid_end_copy_match_emit_encodeBetterBlockAsm1K emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm1K_memmove_move_33through64: MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(SI*1), X2 MOVOU -16(R8)(SI*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(SI*1) MOVOU X3, -16(CX)(SI*1) memmove_mid_end_copy_match_emit_encodeBetterBlockAsm1K: MOVQ R9, CX JMP match_emit_nolits_encodeBetterBlockAsm1K memmove_long_match_emit_encodeBetterBlockAsm1K: LEAQ (CX)(SI*1), R9 // genMemMoveLong MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(SI*1), X2 MOVOU -16(R8)(SI*1), X3 MOVQ SI, R12 SHRQ $0x05, R12 MOVQ CX, R10 ANDL $0x0000001f, R10 MOVQ $0x00000040, R13 SUBQ R10, R13 DECQ R12 JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm1Klarge_forward_sse_loop_32 LEAQ -32(R8)(R13*1), R10 LEAQ -32(CX)(R13*1), R14 emit_lit_memmove_long_match_emit_encodeBetterBlockAsm1Klarge_big_loop_back: MOVOU (R10), X4 MOVOU 16(R10), X5 MOVOA X4, (R14) MOVOA X5, 16(R14) ADDQ $0x20, R14 ADDQ $0x20, R10 ADDQ $0x20, R13 DECQ R12 JNA 
emit_lit_memmove_long_match_emit_encodeBetterBlockAsm1Klarge_big_loop_back emit_lit_memmove_long_match_emit_encodeBetterBlockAsm1Klarge_forward_sse_loop_32: MOVOU -32(R8)(R13*1), X4 MOVOU -16(R8)(R13*1), X5 MOVOA X4, -32(CX)(R13*1) MOVOA X5, -16(CX)(R13*1) ADDQ $0x20, R13 CMPQ SI, R13 JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm1Klarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(SI*1) MOVOU X3, -16(CX)(SI*1) MOVQ R9, CX match_emit_nolits_encodeBetterBlockAsm1K: ADDL R11, AX ADDL $0x04, R11 MOVL AX, 12(SP) // emitCopy CMPL DI, $0x00000400 JA two_byte_match_nolit_encodeBetterBlockAsm1K CMPL R11, $0x00000013 JAE emit_one_longer_match_nolit_encodeBetterBlockAsm1K LEAL -1(DI), SI SHLL $0x06, SI LEAL -15(SI)(R11*4), SI MOVW SI, (CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm1K emit_one_longer_match_nolit_encodeBetterBlockAsm1K: CMPL R11, $0x00000112 JAE emit_copy1_repeat_match_nolit_encodeBetterBlockAsm1K LEAL -1(DI), SI SHLL $0x06, SI LEAL 61(SI), SI MOVW SI, (CX) LEAL -18(R11), SI MOVB SI, 2(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm1K emit_copy1_repeat_match_nolit_encodeBetterBlockAsm1K: LEAL -1(DI), SI SHLL $0x06, SI LEAL 57(SI), SI MOVW SI, (CX) ADDQ $0x02, CX SUBL $0x12, R11 // emitRepeat LEAL -1(R11), SI CMPL R11, $0x1d JBE repeat_one_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm1K LEAL -30(R11), SI CMPL R11, $0x0000011e JB repeat_two_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm1K CMPL R11, $0x0001001e JB repeat_three_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm1K MOVB $0xfc, (CX) MOVL SI, 1(CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm1K repeat_three_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm1K: MOVB $0xf4, (CX) MOVW SI, 1(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm1K repeat_two_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm1K: MOVB $0xec, (CX) MOVB SI, 1(CX) ADDQ $0x02, CX JMP 
match_nolit_emitcopy_end_encodeBetterBlockAsm1K repeat_one_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm1K: XORL SI, SI LEAL -4(SI)(R11*8), SI MOVB SI, (CX) ADDQ $0x01, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm1K two_byte_match_nolit_encodeBetterBlockAsm1K: // emitCopy2 LEAL -64(DI), DI LEAL -4(R11), R11 MOVW DI, 1(CX) CMPL R11, $0x3c JBE emit_copy2_0_match_nolit_encodeBetterBlockAsm1K_emit2 LEAL -60(R11), SI CMPL R11, $0x0000013c JB emit_copy2_1_match_nolit_encodeBetterBlockAsm1K_emit2 CMPL R11, $0x0001003c JB emit_copy2_2_match_nolit_encodeBetterBlockAsm1K_emit2 MOVB $0xfe, (CX) MOVL SI, 3(CX) ADDQ $0x06, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm1K emit_copy2_2_match_nolit_encodeBetterBlockAsm1K_emit2: MOVB $0xfa, (CX) MOVW SI, 3(CX) ADDQ $0x05, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm1K emit_copy2_1_match_nolit_encodeBetterBlockAsm1K_emit2: MOVB $0xf6, (CX) MOVB SI, 3(CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm1K emit_copy2_0_match_nolit_encodeBetterBlockAsm1K_emit2: MOVL $0x00000002, SI LEAL (SI)(R11*4), SI MOVB SI, (CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm1K // emitLiteralsDstP MOVL 12(SP), SI CMPL SI, BX JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm1K MOVL BX, DI MOVL BX, 12(SP) LEAQ (DX)(SI*1), R8 SUBL SI, DI // emitLiteral LEAL -1(DI), SI CMPL SI, $0x1d JB one_byte_match_emit_repeat_encodeBetterBlockAsm1K SUBL $0x1d, SI CMPL SI, $0x00000100 JB two_bytes_match_emit_repeat_encodeBetterBlockAsm1K JB three_bytes_match_emit_repeat_encodeBetterBlockAsm1K three_bytes_match_emit_repeat_encodeBetterBlockAsm1K: MOVB $0xf0, (CX) MOVW SI, 1(CX) ADDQ $0x03, CX ADDL $0x1d, SI JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm1K two_bytes_match_emit_repeat_encodeBetterBlockAsm1K: MOVB $0xe8, (CX) MOVB SI, 1(CX) ADDL $0x1d, SI ADDQ $0x02, CX CMPL SI, $0x40 JB memmove_midmatch_emit_repeat_encodeBetterBlockAsm1K JMP 
memmove_long_match_emit_repeat_encodeBetterBlockAsm1K one_byte_match_emit_repeat_encodeBetterBlockAsm1K: SHLB $0x03, SI MOVB SI, (CX) ADDQ $0x01, CX LEAQ (CX)(DI*1), SI // genMemMoveShort // margin: 8, min move: 1 CMPQ DI, $0x08 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm1K_memmove_move_8 CMPQ DI, $0x10 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm1K_memmove_move_8through16 CMPQ DI, $0x20 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm1K_memmove_move_17through32 JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm1K_memmove_move_33through64 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm1K_memmove_move_8: MOVQ (R8), R9 MOVQ R9, (CX) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm1K emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm1K_memmove_move_8through16: MOVQ (R8), R9 MOVQ -8(R8)(DI*1), R8 MOVQ R9, (CX) MOVQ R8, -8(CX)(DI*1) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm1K emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm1K_memmove_move_17through32: MOVOU (R8), X0 MOVOU -16(R8)(DI*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(DI*1) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm1K emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm1K_memmove_move_33through64: MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(DI*1), X2 MOVOU -16(R8)(DI*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(DI*1) MOVOU X3, -16(CX)(DI*1) memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm1K: MOVQ SI, CX JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm1K memmove_midmatch_emit_repeat_encodeBetterBlockAsm1K: LEAQ (CX)(DI*1), SI // genMemMoveShort // margin: 8, min move: 30 CMPQ DI, $0x20 JBE emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm1K_memmove_move_17through32 JMP emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm1K_memmove_move_33through64 emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm1K_memmove_move_17through32: MOVOU (R8), X0 
MOVOU -16(R8)(DI*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(DI*1) JMP memmove_mid_end_copy_match_emit_repeat_encodeBetterBlockAsm1K emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm1K_memmove_move_33through64: MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(DI*1), X2 MOVOU -16(R8)(DI*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(DI*1) MOVOU X3, -16(CX)(DI*1) memmove_mid_end_copy_match_emit_repeat_encodeBetterBlockAsm1K: MOVQ SI, CX JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm1K memmove_long_match_emit_repeat_encodeBetterBlockAsm1K: LEAQ (CX)(DI*1), SI // genMemMoveLong MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(DI*1), X2 MOVOU -16(R8)(DI*1), X3 MOVQ DI, R10 SHRQ $0x05, R10 MOVQ CX, R9 ANDL $0x0000001f, R9 MOVQ $0x00000040, R12 SUBQ R9, R12 DECQ R10 JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm1Klarge_forward_sse_loop_32 LEAQ -32(R8)(R12*1), R9 LEAQ -32(CX)(R12*1), R13 emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm1Klarge_big_loop_back: MOVOU (R9), X4 MOVOU 16(R9), X5 MOVOA X4, (R13) MOVOA X5, 16(R13) ADDQ $0x20, R13 ADDQ $0x20, R9 ADDQ $0x20, R12 DECQ R10 JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm1Klarge_big_loop_back emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm1Klarge_forward_sse_loop_32: MOVOU -32(R8)(R12*1), X4 MOVOU -16(R8)(R12*1), X5 MOVOA X4, -32(CX)(R12*1) MOVOA X5, -16(CX)(R12*1) ADDQ $0x20, R12 CMPQ DI, R12 JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm1Klarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(DI*1) MOVOU X3, -16(CX)(DI*1) MOVQ SI, CX emit_literal_done_match_emit_repeat_encodeBetterBlockAsm1K: ADDL R11, AX ADDL $0x04, R11 MOVL AX, 12(SP) // emitRepeat LEAL -1(R11), SI CMPL R11, $0x1d JBE repeat_one_match_nolit_repeat_encodeBetterBlockAsm1K LEAL -30(R11), SI CMPL R11, $0x0000011e JB repeat_two_match_nolit_repeat_encodeBetterBlockAsm1K CMPL R11, $0x0001001e JB 
repeat_three_match_nolit_repeat_encodeBetterBlockAsm1K MOVB $0xfc, (CX) MOVL SI, 1(CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm1K repeat_three_match_nolit_repeat_encodeBetterBlockAsm1K: MOVB $0xf4, (CX) MOVW SI, 1(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm1K repeat_two_match_nolit_repeat_encodeBetterBlockAsm1K: MOVB $0xec, (CX) MOVB SI, 1(CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm1K repeat_one_match_nolit_repeat_encodeBetterBlockAsm1K: XORL SI, SI LEAL -4(SI)(R11*8), SI MOVB SI, (CX) ADDQ $0x01, CX match_nolit_emitcopy_end_encodeBetterBlockAsm1K: CMPL AX, 8(SP) JAE emit_remainder_encodeBetterBlockAsm1K CMPQ CX, (SP) JB match_nolit_dst_ok_encodeBetterBlockAsm1K MOVQ $0x00000000, ret+56(FP) RET match_nolit_dst_ok_encodeBetterBlockAsm1K: MOVQ tmp+48(FP), SI MOVQ $0x0000cf1bbcdcbf9b, DI MOVQ $0x9e3779b1, R8 LEAQ 1(BX), BX LEAQ -2(AX), R9 MOVQ (DX)(BX*1), R10 MOVQ 1(DX)(BX*1), R11 MOVQ (DX)(R9*1), R12 MOVQ 1(DX)(R9*1), R13 SHLQ $0x10, R10 IMULQ DI, R10 SHRQ $0x35, R10 SHLQ $0x20, R11 IMULQ R8, R11 SHRQ $0x38, R11 SHLQ $0x10, R12 IMULQ DI, R12 SHRQ $0x35, R12 SHLQ $0x20, R13 IMULQ R8, R13 SHRQ $0x38, R13 LEAQ 1(BX), R8 LEAQ 1(R9), R14 MOVW BX, (SI)(R10*2) MOVW R9, (SI)(R12*2) LEAQ 1(R9)(BX*1), R10 SHRQ $0x01, R10 ADDQ $0x01, BX SUBQ $0x01, R9 MOVW R8, 4096(SI)(R11*2) MOVW R14, 4096(SI)(R13*2) index_loop_encodeBetterBlockAsm1K: CMPQ R10, R9 JAE search_loop_encodeBetterBlockAsm1K MOVQ (DX)(BX*1), R8 MOVQ (DX)(R10*1), R11 SHLQ $0x10, R8 IMULQ DI, R8 SHRQ $0x35, R8 SHLQ $0x10, R11 IMULQ DI, R11 SHRQ $0x35, R11 MOVW BX, (SI)(R8*2) MOVW R9, (SI)(R11*2) ADDQ $0x02, BX ADDQ $0x02, R10 JMP index_loop_encodeBetterBlockAsm1K emit_remainder_encodeBetterBlockAsm1K: MOVQ src_len+32(FP), AX SUBL 12(SP), AX LEAQ 3(CX)(AX*1), AX CMPQ AX, (SP) JB emit_remainder_ok_encodeBetterBlockAsm1K MOVQ $0x00000000, ret+56(FP) RET emit_remainder_ok_encodeBetterBlockAsm1K: MOVQ src_len+32(FP), AX // emitLiteralsDstP MOVL 
12(SP), BX CMPL BX, AX JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm1K MOVL AX, SI MOVL AX, 12(SP) LEAQ (DX)(BX*1), AX SUBL BX, SI // emitLiteral LEAL -1(SI), DX CMPL DX, $0x1d JB one_byte_emit_remainder_encodeBetterBlockAsm1K SUBL $0x1d, DX CMPL DX, $0x00000100 JB two_bytes_emit_remainder_encodeBetterBlockAsm1K JB three_bytes_emit_remainder_encodeBetterBlockAsm1K three_bytes_emit_remainder_encodeBetterBlockAsm1K: MOVB $0xf0, (CX) MOVW DX, 1(CX) ADDQ $0x03, CX ADDL $0x1d, DX JMP memmove_long_emit_remainder_encodeBetterBlockAsm1K two_bytes_emit_remainder_encodeBetterBlockAsm1K: MOVB $0xe8, (CX) MOVB DL, 1(CX) ADDL $0x1d, DX ADDQ $0x02, CX CMPL DX, $0x40 JB memmove_midemit_remainder_encodeBetterBlockAsm1K JMP memmove_long_emit_remainder_encodeBetterBlockAsm1K one_byte_emit_remainder_encodeBetterBlockAsm1K: SHLB $0x03, DL MOVB DL, (CX) ADDQ $0x01, CX LEAQ (CX)(SI*1), DX MOVL SI, BX // genMemMoveShort // margin: -1, min move: 1 CMPQ BX, $0x03 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm1K_memmove_move_1or2 JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm1K_memmove_move_3 CMPQ BX, $0x08 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm1K_memmove_move_4through8 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm1K_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm1K_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm1K_memmove_move_33through64 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm1K_memmove_move_1or2: MOVB (AX), SI MOVB -1(AX)(BX*1), AL MOVB SI, (CX) MOVB AL, -1(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm1K emit_lit_memmove_emit_remainder_encodeBetterBlockAsm1K_memmove_move_3: MOVW (AX), SI MOVB 2(AX), AL MOVW SI, (CX) MOVB AL, 2(CX) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm1K emit_lit_memmove_emit_remainder_encodeBetterBlockAsm1K_memmove_move_4through8: MOVL (AX), SI MOVL -4(AX)(BX*1), AX 
MOVL SI, (CX) MOVL AX, -4(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm1K emit_lit_memmove_emit_remainder_encodeBetterBlockAsm1K_memmove_move_8through16: MOVQ (AX), SI MOVQ -8(AX)(BX*1), AX MOVQ SI, (CX) MOVQ AX, -8(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm1K emit_lit_memmove_emit_remainder_encodeBetterBlockAsm1K_memmove_move_17through32: MOVOU (AX), X0 MOVOU -16(AX)(BX*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm1K emit_lit_memmove_emit_remainder_encodeBetterBlockAsm1K_memmove_move_33through64: MOVOU (AX), X0 MOVOU 16(AX), X1 MOVOU -32(AX)(BX*1), X2 MOVOU -16(AX)(BX*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(BX*1) MOVOU X3, -16(CX)(BX*1) memmove_end_copy_emit_remainder_encodeBetterBlockAsm1K: MOVQ DX, CX JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm1K memmove_midemit_remainder_encodeBetterBlockAsm1K: LEAQ (CX)(SI*1), DX MOVL SI, BX // genMemMoveShort // margin: -2, min move: 30 CMPQ BX, $0x20 JBE emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm1K_memmove_move_17through32 JMP emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm1K_memmove_move_33through64 emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm1K_memmove_move_17through32: MOVOU (AX), X0 MOVOU -16(AX)(BX*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(BX*1) JMP memmove_mid_end_copy_emit_remainder_encodeBetterBlockAsm1K emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm1K_memmove_move_33through64: MOVOU (AX), X0 MOVOU 16(AX), X1 MOVOU -32(AX)(BX*1), X2 MOVOU -16(AX)(BX*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(BX*1) MOVOU X3, -16(CX)(BX*1) memmove_mid_end_copy_emit_remainder_encodeBetterBlockAsm1K: MOVQ DX, CX JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm1K memmove_long_emit_remainder_encodeBetterBlockAsm1K: LEAQ (CX)(SI*1), DX MOVL SI, BX // genMemMoveLong MOVOU (AX), X0 MOVOU 16(AX), X1 MOVOU -32(AX)(BX*1), X2 MOVOU -16(AX)(BX*1), X3 
// Tail of the long literal copy in encodeBetterBlockAsm1K's remainder path
// (the function itself begins on an earlier line).
// Here AX = source, CX = destination, BX = length, DX = destination end.
	MOVQ BX, DI
	SHRQ $0x05, DI
	MOVQ CX, SI
	ANDL $0x0000001f, SI
	MOVQ $0x00000040, R8
	SUBQ SI, R8
	DECQ DI
	JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm1Klarge_forward_sse_loop_32
	LEAQ -32(AX)(R8*1), SI
	LEAQ -32(CX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm1Klarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ $0x20, R9
	ADDQ $0x20, SI
	ADDQ $0x20, R8
	DECQ DI
	JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm1Klarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm1Klarge_forward_sse_loop_32:
	MOVOU -32(AX)(R8*1), X4
	MOVOU -16(AX)(R8*1), X5
	MOVOA X4, -32(CX)(R8*1)
	MOVOA X5, -16(CX)(R8*1)
	ADDQ $0x20, R8
	CMPQ BX, R8
	JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm1Klarge_forward_sse_loop_32
	// Head and tail vectors were loaded before the loop; storing them last
	// covers the unaligned first and last 32 bytes.
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)
	MOVQ DX, CX

emit_literal_done_emit_remainder_encodeBetterBlockAsm1K:
	// Return value = write pointer - dst base.
	MOVQ dst_base+0(FP), AX
	SUBQ AX, CX
	MOVQ CX, ret+56(FP)
	RET

// func emitLiteral(dst []byte, lit []byte) int
// Requires: SSE2
//
// Writes a literal header for len(lit) followed by the literal bytes to dst.
// Returns len(lit) plus the header size (1-4 bytes), or 0 when lit is empty.
// Registers: AX = dst write pointer, CX = lit pointer, DX = len(lit),
// BX = running return value, SI = encoded length value.
// dst_len is never read here, so the caller presumably guarantees enough room
// (the vector copies may store up to the end of a 16/32-byte chunk).
TEXT ·emitLiteral(SB), NOSPLIT, $0-56
	MOVQ lit_len+32(FP), DX
	MOVQ dst_base+0(FP), AX
	MOVQ lit_base+24(FP), CX
	TESTQ DX, DX
	JZ emit_literal_end_standalone_skip

	// emitLiteral
	MOVL DX, BX
	LEAL -1(DX), SI
	CMPL SI, $0x1d
	JB one_byte_standalone
	SUBL $0x1d, SI
	CMPL SI, $0x00000100
	JB two_bytes_standalone
	CMPL SI, $0x00010000
	JB three_bytes_standalone
	// 4-byte header: tag 0xf8 followed by a 24-bit little-endian (len-30).
	MOVL SI, DI
	SHRL $0x10, DI
	MOVB $0xf8, (AX)
	MOVW SI, 1(AX)
	MOVB DI, 3(AX)
	ADDQ $0x04, BX
	ADDQ $0x04, AX
	ADDL $0x1d, SI
	JMP memmove_long_standalone

three_bytes_standalone:
	// 3-byte header: tag 0xf0 followed by a 16-bit (len-30).
	MOVB $0xf0, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, BX
	ADDQ $0x03, AX
	ADDL $0x1d, SI
	JMP memmove_long_standalone

two_bytes_standalone:
	// 2-byte header: tag 0xe8 followed by an 8-bit (len-30).
	MOVB $0xe8, (AX)
	MOVB SI, 1(AX)
	ADDL $0x1d, SI
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	CMPL SI, $0x40
	JB memmove_midstandalone
	JMP memmove_long_standalone

one_byte_standalone:
	// 1-byte header: (len-1) << 3.
	SHLB $0x03, SI
	MOVB SI, (AX)
	ADDQ $0x01, BX
	ADDQ $0x01, AX

	// genMemMoveShort
	// margin: 0, min move: 1
	CMPQ DX, $0x03
	JB emit_lit_memmove_standalone_memmove_move_1or2
	JE emit_lit_memmove_standalone_memmove_move_3
	CMPQ DX, $0x08
	JBE emit_lit_memmove_standalone_memmove_move_4through8
	CMPQ DX, $0x10
	JBE emit_lit_memmove_standalone_memmove_move_8through16
	CMPQ DX, $0x20
	JBE emit_lit_memmove_standalone_memmove_move_17through32
	JMP emit_lit_memmove_standalone_memmove_move_33through64

	// Each size class copies with two possibly overlapping loads/stores:
	// one anchored at the start and one anchored at the end of the range.
emit_lit_memmove_standalone_memmove_move_1or2:
	MOVB (CX), SI
	MOVB -1(CX)(DX*1), CL
	MOVB SI, (AX)
	MOVB CL, -1(AX)(DX*1)
	JMP emit_literal_end_standalone

emit_lit_memmove_standalone_memmove_move_3:
	MOVW (CX), SI
	MOVB 2(CX), CL
	MOVW SI, (AX)
	MOVB CL, 2(AX)
	JMP emit_literal_end_standalone

emit_lit_memmove_standalone_memmove_move_4through8:
	MOVL (CX), SI
	MOVL -4(CX)(DX*1), CX
	MOVL SI, (AX)
	MOVL CX, -4(AX)(DX*1)
	JMP emit_literal_end_standalone

emit_lit_memmove_standalone_memmove_move_8through16:
	MOVQ (CX), SI
	MOVQ -8(CX)(DX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(DX*1)
	JMP emit_literal_end_standalone

emit_lit_memmove_standalone_memmove_move_17through32:
	MOVOU (CX), X0
	MOVOU -16(CX)(DX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(DX*1)
	JMP emit_literal_end_standalone

emit_lit_memmove_standalone_memmove_move_33through64:
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(DX*1), X2
	MOVOU -16(CX)(DX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(DX*1)
	MOVOU X3, -16(AX)(DX*1)
	JMP emit_literal_end_standalone
	// Unreachable: directly preceded by an unconditional JMP (kept as generated).
	JMP emit_literal_end_standalone

memmove_midstandalone:
	// genMemMoveShort
	// margin: 0, min move: 30
	CMPQ DX, $0x20
	JBE emit_lit_memmove_mid_standalone_memmove_move_17through32
	JMP emit_lit_memmove_mid_standalone_memmove_move_33through64

emit_lit_memmove_mid_standalone_memmove_move_17through32:
	MOVOU (CX), X0
	MOVOU -16(CX)(DX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(DX*1)
	JMP emit_literal_end_standalone

emit_lit_memmove_mid_standalone_memmove_move_33through64:
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(DX*1), X2
	MOVOU -16(CX)(DX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(DX*1)
// Remainder of emitLiteral: last store of the 33..64-byte "mid" copy, the long
// copy, and the shared exit. AX = dst write pointer, CX = lit pointer,
// DX = len(lit), BX = return value.
	MOVOU X3, -16(AX)(DX*1)
	JMP emit_literal_end_standalone
	// Unreachable: directly preceded by an unconditional JMP (kept as generated).
	JMP emit_literal_end_standalone

memmove_long_standalone:
	// genMemMoveLong
	// First and last 32 bytes are loaded up front and stored after the loop,
	// so the loop itself can use aligned stores (MOVOA) to the destination.
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(DX*1), X2
	MOVOU -16(CX)(DX*1), X3
	MOVQ DX, DI
	SHRQ $0x05, DI
	MOVQ AX, SI
	ANDL $0x0000001f, SI
	MOVQ $0x00000040, R8
	SUBQ SI, R8
	DECQ DI
	JA emit_lit_memmove_long_standalonelarge_forward_sse_loop_32
	LEAQ -32(CX)(R8*1), SI
	LEAQ -32(AX)(R8*1), R9

emit_lit_memmove_long_standalonelarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ $0x20, R9
	ADDQ $0x20, SI
	ADDQ $0x20, R8
	DECQ DI
	JNA emit_lit_memmove_long_standalonelarge_big_loop_back

emit_lit_memmove_long_standalonelarge_forward_sse_loop_32:
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ $0x20, R8
	CMPQ DX, R8
	JAE emit_lit_memmove_long_standalonelarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(DX*1)
	MOVOU X3, -16(AX)(DX*1)
	JMP emit_literal_end_standalone
	// Unreachable: directly preceded by an unconditional JMP (kept as generated).
	JMP emit_literal_end_standalone

emit_literal_end_standalone_skip:
	// Empty literal: nothing written, return 0.
	XORQ BX, BX

emit_literal_end_standalone:
	MOVQ BX, ret+48(FP)
	RET

// func emitRepeat(dst []byte, length int) int
//
// Writes a repeat tag for `length` at dst and returns the number of bytes
// counted as written (1-4). AX = dst write pointer, CX = length,
// BX = encoded length value, DX = return value.
TEXT ·emitRepeat(SB), NOSPLIT, $0-40
	XORQ DX, DX
	MOVQ dst_base+0(FP), AX
	MOVQ length+24(FP), CX

	// emitRepeat
	LEAL -1(CX), BX
	CMPL CX, $0x1d
	JBE repeat_one_standalone
	LEAL -30(CX), BX
	CMPL CX, $0x0000011e
	JB repeat_two_standalone
	CMPL CX, $0x0001001e
	JB repeat_three_standalone
	// 4-byte form: tag 0xfc + (length-30). MOVL stores 4 bytes at dst+1 but
	// only 3 of them are counted in the returned size.
	MOVB $0xfc, (AX)
	MOVL BX, 1(AX)
	ADDQ $0x04, DX
	ADDQ $0x04, AX
	JMP gen_emit_repeat_end

repeat_three_standalone:
	// 3-byte form: tag 0xf4 + 16-bit (length-30).
	MOVB $0xf4, (AX)
	MOVW BX, 1(AX)
	ADDQ $0x03, DX
	ADDQ $0x03, AX
	JMP gen_emit_repeat_end

repeat_two_standalone:
	// 2-byte form: tag 0xec + 8-bit (length-30).
	MOVB $0xec, (AX)
	MOVB BL, 1(AX)
	ADDQ $0x02, DX
	ADDQ $0x02, AX
	JMP gen_emit_repeat_end

repeat_one_standalone:
	// 1-byte form: length*8 - 4, i.e. ((length-1) << 3) | 4.
	XORL BX, BX
	LEAL -4(BX)(CX*8), BX
	MOVB BL, (AX)
	ADDQ $0x01, DX
	ADDQ $0x01, AX

gen_emit_repeat_end:
	MOVQ DX, ret+32(FP)
	RET

// func emitCopy(dst []byte, offset int, length int) int
//
// Writes a copy operation for (offset, length) at dst and returns the number
// of bytes counted as written. AX = dst write pointer, CX = offset,
// DX = length, BX = return value. Three encodings are selected by offset:
// offset > 0x1003f -> emitCopy3, offset <= 0x400 -> the short form at
// two_byte_offset_standalone, otherwise emitCopy2 (two_byte_standalone).
TEXT ·emitCopy(SB), NOSPLIT, $0-48
	XORQ BX, BX
	MOVQ dst_base+0(FP), AX
	MOVQ offset+24(FP), CX
	MOVQ length+32(FP), DX

	// emitCopy
	CMPL CX, $0x0001003f
	JBE two_byte_offset_standalone

	// emitCopy3
	// CX becomes ((offset-65536) << 11) | 7; the length field is merged in
	// at bit 5 below (values 61/62/63 there select 1/2/3 extra length bytes).
	LEAL -4(DX), DX
	LEAL -65536(CX), CX
	SHLL $0x0b, CX
	ADDL $0x07, CX
	CMPL DX, $0x3c
	JBE emit_copy3_0_standalone_emit3
	LEAL -60(DX), SI
	CMPL DX, $0x0000013c
	JB emit_copy3_1_standalone_emit3
	CMPL DX, $0x0001003c
	JB emit_copy3_2_standalone_emit3
	ADDL $0x000007e0, CX
	MOVL CX, (AX)
	MOVL SI, 4(AX)
	ADDQ $0x07, BX
	ADDQ $0x07, AX
	JMP gen_emit_copy_end

emit_copy3_2_standalone_emit3:
	ADDL $0x000007c0, CX
	MOVL CX, (AX)
	MOVW SI, 4(AX)
	ADDQ $0x06, BX
	ADDQ $0x06, AX
	JMP gen_emit_copy_end

emit_copy3_1_standalone_emit3:
	ADDL $0x000007a0, CX
	MOVL CX, (AX)
	MOVB SI, 4(AX)
	ADDQ $0x05, BX
	ADDQ $0x05, AX
	JMP gen_emit_copy_end

emit_copy3_0_standalone_emit3:
	SHLL $0x05, DX
	ORL DX, CX
	MOVL CX, (AX)
	ADDQ $0x04, BX
	ADDQ $0x04, AX
	JMP gen_emit_copy_end

two_byte_offset_standalone:
	CMPL CX, $0x00000400
	JA two_byte_standalone
	CMPL DX, $0x00000013
	JAE emit_one_longer_standalone
	// 2-byte form: ((offset-1) << 6) + length*4 - 15,
	// i.e. ((offset-1) << 6) | ((length-4) << 2) | 1.
	LEAL -1(CX), CX
	SHLL $0x06, CX
	LEAL -15(CX)(DX*4), CX
	MOVW CX, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP gen_emit_copy_end

emit_one_longer_standalone:
	CMPL DX, $0x00000112
	JAE emit_copy1_repeat_standalone
	// 3-byte form: length field 15 (constant 61 = 15<<2 | 1) plus one extra
	// byte holding length-18.
	LEAL -1(CX), CX
	SHLL $0x06, CX
	LEAL 61(CX), CX
	MOVW CX, (AX)
	LEAL -18(DX), CX
	MOVB CL, 2(AX)
	ADDQ $0x03, BX
	ADDQ $0x03, AX
	JMP gen_emit_copy_end

emit_copy1_repeat_standalone:
	// Emit an 18-byte copy (constant 57 = 14<<2 | 1), then encode the
	// remaining length-18 as a repeat.
	LEAL -1(CX), CX
	SHLL $0x06, CX
	LEAL 57(CX), CX
	MOVW CX, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	SUBL $0x12, DX

	// emitRepeat
	LEAL -1(DX), CX
	CMPL DX, $0x1d
	JBE repeat_one_emit_copy1_do_repeat_standalone
	LEAL -30(DX), CX
	CMPL DX, $0x0000011e
	JB repeat_two_emit_copy1_do_repeat_standalone
	CMPL DX, $0x0001001e
	JB repeat_three_emit_copy1_do_repeat_standalone
	MOVB $0xfc, (AX)
	MOVL CX, 1(AX)
	ADDQ $0x04, BX
	ADDQ $0x04, AX
	JMP gen_emit_copy_end

repeat_three_emit_copy1_do_repeat_standalone:
	MOVB $0xf4, (AX)
	MOVW CX, 1(AX)
	ADDQ $0x03, BX
	ADDQ $0x03, AX
	JMP gen_emit_copy_end
repeat_two_emit_copy1_do_repeat_standalone:
	// End of emitCopy: remaining repeat forms, the emitCopy2 encoding and the
	// shared exit. AX = dst write pointer, CX = scratch/offset, DX = length,
	// BX = return value.
	MOVB $0xec, (AX)
	MOVB CL, 1(AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP gen_emit_copy_end

repeat_one_emit_copy1_do_repeat_standalone:
	XORL CX, CX
	LEAL -4(CX)(DX*8), CX
	MOVB CL, (AX)
	ADDQ $0x01, BX
	ADDQ $0x01, AX
	JMP gen_emit_copy_end

two_byte_standalone:
	// emitCopy2
	// offset-64 is stored as 16 bits at dst+1; the tag byte at dst+0 is
	// written afterwards once the length form is known.
	LEAL -64(CX), CX
	LEAL -4(DX), DX
	MOVW CX, 1(AX)
	CMPL DX, $0x3c
	JBE emit_copy2_0_standalone_emit2
	LEAL -60(DX), CX
	CMPL DX, $0x0000013c
	JB emit_copy2_1_standalone_emit2
	CMPL DX, $0x0001003c
	JB emit_copy2_2_standalone_emit2
	MOVB $0xfe, (AX)
	MOVL CX, 3(AX)
	ADDQ $0x06, BX
	ADDQ $0x06, AX
	JMP gen_emit_copy_end

emit_copy2_2_standalone_emit2:
	MOVB $0xfa, (AX)
	MOVW CX, 3(AX)
	ADDQ $0x05, BX
	ADDQ $0x05, AX
	JMP gen_emit_copy_end

emit_copy2_1_standalone_emit2:
	MOVB $0xf6, (AX)
	MOVB CL, 3(AX)
	ADDQ $0x04, BX
	ADDQ $0x04, AX
	JMP gen_emit_copy_end

emit_copy2_0_standalone_emit2:
	// Tag = ((length-4) << 2) | 2.
	MOVL $0x00000002, CX
	LEAL (CX)(DX*4), CX
	MOVB CL, (AX)
	ADDQ $0x03, BX
	ADDQ $0x03, AX

gen_emit_copy_end:
	MOVQ BX, ret+40(FP)
	RET

// func emitCopyLits2(dst []byte, lits []byte, offset int, length int) int
// Requires: CMOV
//
// Writes a 3-byte copy tag that also carries len(lits), then the literal
// bytes, then (when length > 11) a repeat for the remaining length-11.
// Returns the number of bytes counted as written.
// AX = dst write pointer, SI = len(lits), CX = offset / scratch,
// DX = length / scratch, DI = leftover length for the repeat, BX = return.
// The tag stores len(lits)-1 in two bits, so len(lits) is presumably 1..4
// (verify against callers); the 4-byte literal move stores 4 bytes.
TEXT ·emitCopyLits2(SB), NOSPLIT, $0-72
	XORQ BX, BX
	MOVQ dst_base+0(FP), AX
	MOVQ lits_len+32(FP), SI
	MOVQ offset+48(FP), CX
	MOVQ length+56(FP), DX
	// NOTE(review): the flags from this compare are overwritten by the XORQ
	// below before any instruction reads them, so it has no effect. Looks like
	// a generator leftover; consider dropping it in gen.go.
	CMPL DX, $0x0b

	// emitCopy2WithLits
	XORQ DI, DI
	SUBL $0x40, CX
	LEAL -11(DX), R8
	LEAL -4(DX), DX
	MOVW CX, 1(AX)
	// Length field is min(length-4, 7); when length-4 >= 7 the remainder
	// (length-11) is kept in DI for the trailing repeat.
	CMPL DX, $0x07
	CMOVLGE R8, DI
	MOVQ $0x00000007, CX
	CMOVLLT DX, CX
	// Tag = 3 | ((len(lits)-1 + 4*lengthField) << 3).
	LEAL -1(SI)(CX*4), CX
	MOVL $0x00000003, DX
	LEAL (DX)(CX*8), CX
	MOVB CL, (AX)
	ADDQ $0x03, BX
	ADDQ $0x03, AX
	MOVQ lits_base+24(FP), CX

	// genMemMoveVeryShort
	CMPQ SI, $0x03
	JE standalone_emitcopy2_lits_move_3
	JA standalone_emitcopy2_lits_move_4
	MOVB (CX), DL
	MOVB -1(CX)(SI*1), CL
	MOVB DL, (AX)
	MOVB CL, -1(AX)(SI*1)
	JMP standalone_emitcopy2_lits_end

standalone_emitcopy2_lits_move_3:
	MOVW (CX), DX
	MOVB 2(CX), CL
	MOVW DX, (AX)
	MOVB CL, 2(AX)
	JMP standalone_emitcopy2_lits_end

standalone_emitcopy2_lits_move_4:
	MOVL (CX), DX
	MOVL DX, (AX)

standalone_emitcopy2_lits_end:
	ADDQ SI, BX
// Tail of emitCopyLits2: advance past the literals and, if DI (leftover match
// length) is non-zero, append a repeat for it. BX = return value.
	ADDQ SI, AX
	TESTL DI, DI
	JZ standalone_emitcopy2_lits_done

	// emitRepeat
	LEAL -1(DI), CX
	CMPL DI, $0x1d
	JBE repeat_one_standalone_emitcopy2_lits
	LEAL -30(DI), CX
	CMPL DI, $0x0000011e
	JB repeat_two_standalone_emitcopy2_lits
	CMPL DI, $0x0001001e
	JB repeat_three_standalone_emitcopy2_lits
	MOVB $0xfc, (AX)
	MOVL CX, 1(AX)
	ADDQ $0x04, BX
	ADDQ $0x04, AX
	JMP standalone_emitcopy2_lits_done

repeat_three_standalone_emitcopy2_lits:
	MOVB $0xf4, (AX)
	MOVW CX, 1(AX)
	ADDQ $0x03, BX
	ADDQ $0x03, AX
	JMP standalone_emitcopy2_lits_done

repeat_two_standalone_emitcopy2_lits:
	MOVB $0xec, (AX)
	MOVB CL, 1(AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP standalone_emitcopy2_lits_done

repeat_one_standalone_emitcopy2_lits:
	XORL CX, CX
	LEAL -4(CX)(DI*8), CX
	MOVB CL, (AX)
	ADDQ $0x01, BX
	ADDQ $0x01, AX

standalone_emitcopy2_lits_done:
	MOVQ BX, ret+64(FP)
	RET

// func emitCopyLits3(dst []byte, lits []byte, offset int, length int) int
//
// Writes an emitCopy3-style tag with len(lits) merged in at bit 3, followed
// by the literal bytes. Returns the number of bytes counted as written.
// AX = dst write pointer, SI = len(lits), CX = tag word / lits pointer,
// DX = length-4, DI = extended length value, BX = return value.
// The literal move only handles 1, 2 or 3 bytes, so len(lits) is presumably
// 1..3 (verify against callers).
TEXT ·emitCopyLits3(SB), NOSPLIT, $0-72
	XORQ BX, BX
	MOVQ dst_base+0(FP), AX
	MOVQ lits_len+32(FP), SI
	MOVQ offset+48(FP), CX
	MOVQ length+56(FP), DX

	// emitCopy3
	LEAL -4(DX), DX
	LEAL -65536(CX), CX
	SHLL $0x0b, CX
	LEAL 7(CX)(SI*8), CX
	CMPL DX, $0x3c
	JBE emit_copy3_0_standalone_lits
	LEAL -60(DX), DI
	CMPL DX, $0x0000013c
	JB emit_copy3_1_standalone_lits
	CMPL DX, $0x0001003c
	JB emit_copy3_2_standalone_lits
	ADDL $0x000007e0, CX
	MOVL CX, (AX)
	MOVL DI, 4(AX)
	ADDQ $0x07, BX
	ADDQ $0x07, AX
	JMP gen_emit_copy_lits_copylits

emit_copy3_2_standalone_lits:
	ADDL $0x000007c0, CX
	MOVL CX, (AX)
	MOVW DI, 4(AX)
	ADDQ $0x06, BX
	ADDQ $0x06, AX
	JMP gen_emit_copy_lits_copylits

emit_copy3_1_standalone_lits:
	ADDL $0x000007a0, CX
	MOVL CX, (AX)
	MOVB DI, 4(AX)
	ADDQ $0x05, BX
	ADDQ $0x05, AX
	JMP gen_emit_copy_lits_copylits

emit_copy3_0_standalone_lits:
	SHLL $0x05, DX
	ORL DX, CX
	MOVL CX, (AX)
	ADDQ $0x04, BX
	ADDQ $0x04, AX

gen_emit_copy_lits_copylits:
	MOVQ lits_base+24(FP), CX

	// genMemMoveVeryShort
	CMPQ SI, $0x03
	JE standalone_emitcopy3_lits_move_3
	MOVB (CX), DL
	MOVB -1(CX)(SI*1), CL
	MOVB DL, (AX)
	MOVB CL, -1(AX)(SI*1)
	JMP standalone_emitcopy3_lits_end

standalone_emitcopy3_lits_move_3:
	MOVW (CX), DX
	MOVB 2(CX), CL
	MOVW DX, (AX)
	MOVB CL, 2(AX)

standalone_emitcopy3_lits_end:
	ADDQ SI, BX
	MOVQ BX, ret+64(FP)
	RET

// func matchLen(a []byte, b []byte) int
// Requires: BMI
//
// Returns the number of leading bytes that are equal in a and b, examining at
// most len(a) bytes. Only a_len is loaded; b is read at the same indices, so
// callers presumably guarantee len(b) >= len(a).
// AX = a, CX = b, DX = bytes left to compare, SI = matched length so far.
TEXT ·matchLen(SB), NOSPLIT, $0-56
	MOVQ a_base+0(FP), AX
	MOVQ b_base+24(FP), CX
	MOVQ a_len+8(FP), DX

	// matchLen
	XORL SI, SI
	JMP matchlen_loop_16_entry_standalone

matchlen_loopback_16_standalone:
	// Compare 16 bytes per iteration as two 8-byte XORs.
	MOVQ (AX)(SI*1), BX
	MOVQ 8(AX)(SI*1), DI
	XORQ (CX)(SI*1), BX
	JNZ matchlen_bsf_8_standalone
	XORQ 8(CX)(SI*1), DI
	JNZ matchlen_bsf_16standalone
	LEAL -16(DX), DX
	LEAL 16(SI), SI

matchlen_loop_16_entry_standalone:
	CMPL DX, $0x10
	JAE matchlen_loopback_16_standalone
	JMP matchlen_match8_standalone

matchlen_bsf_16standalone:
	// Index of the lowest set bit of the XOR, divided by 8, is the index of
	// the first differing byte within the second 8-byte word.
#ifdef GOAMD64_v3
	TZCNTQ DI, DI

#else
	BSFQ DI, DI

#endif
	SARQ $0x03, DI
	LEAL 8(SI)(DI*1), SI
	JMP gen_match_len_end

matchlen_match8_standalone:
	CMPL DX, $0x08
	JB matchlen_match4_standalone
	MOVQ (AX)(SI*1), BX
	XORQ (CX)(SI*1), BX
	JNZ matchlen_bsf_8_standalone
	LEAL -8(DX), DX
	LEAL 8(SI), SI
	JMP matchlen_match4_standalone

matchlen_bsf_8_standalone:
#ifdef GOAMD64_v3
	TZCNTQ BX, BX

#else
	BSFQ BX, BX

#endif
	SARQ $0x03, BX
	LEAL (SI)(BX*1), SI
	JMP gen_match_len_end

matchlen_match4_standalone:
	// On a 4-byte mismatch fall through to the 2- and 1-byte checks, which
	// locate the exact differing byte.
	CMPL DX, $0x04
	JB matchlen_match2_standalone
	MOVL (AX)(SI*1), BX
	CMPL (CX)(SI*1), BX
	JNE matchlen_match2_standalone
	LEAL -4(DX), DX
	LEAL 4(SI), SI

matchlen_match2_standalone:
	CMPL DX, $0x01
	JE matchlen_match1_standalone
	JB gen_match_len_end
	MOVW (AX)(SI*1), BX
	CMPW (CX)(SI*1), BX
	JNE matchlen_match1_standalone
	LEAL 2(SI), SI
	SUBL $0x02, DX
	JZ gen_match_len_end

matchlen_match1_standalone:
	MOVB (AX)(SI*1), BL
	CMPB (CX)(SI*1), BL
	JNE gen_match_len_end
	LEAL 1(SI), SI

gen_match_len_end:
	MOVQ SI, ret+48(FP)
	RET

// func cvtLZ4BlockAsm(dst []byte, src []byte) (uncompressed int, dstUsed int)
// Requires: CMOV, SSE2
//
// Re-encodes the LZ4 block in src into dst. On success returns the
// uncompressed size and the number of dst bytes used. On failure only
// `uncompressed` is written: -1 (lz4_mz_corrupt) or -2 (lz4_mz_dstfull).
//
// Registers in the main loop:
//   AX  dst write pointer          CX  dst limit (dst end - 12)
//   DX  src read pointer           BX  src end
//   SI  uncompressed bytes so far  R8  literal count, later match offset
//   R9  match length               R10 literal count when deferred (token <
//                                      0x50) so it can be fused with the copy
//   (SP) 8-byte slot holding the last match offset (for repeat detection)
TEXT ·cvtLZ4BlockAsm(SB), NOSPLIT, $8-64
	XORQ SI, SI
	MOVQ dst_base+0(FP), AX
	MOVQ dst_len+8(FP), CX
	MOVQ src_base+24(FP), DX
	MOVQ src_len+32(FP), BX
	LEAQ (DX)(BX*1), BX
	LEAQ -12(AX)(CX*1), CX
	// Initialise the last-offset slot to 1 with a 64-bit store. The slot is
	// compared (CMPQ) and updated (MOVQ) as 64 bits below; a 32-bit store
	// would leave its upper half unspecified at the first compare.
	// This file is generated: mirror this change in gen.go.
	MOVQ $0x00000001, (SP)

lz4_mz_loop:
	CMPQ DX, BX
	JAE lz4_mz_corrupt
	CMPQ AX, CX
	JAE lz4_mz_dstfull
	// Token: high nibble = literal count, low nibble = match length - 4.
	MOVBQZX (DX), DI
	MOVQ DI, R8
	MOVQ DI, R9
	ANDQ $0x0f, R9
	XORQ R10, R10
	SHRQ $0x04, R8
	// Token < 0x50 means at most 4 literals: remember the count in R10 and
	// defer emitting them until the match is known.
	CMPQ DI, $0x50
	CMOVQLT R8, R10
	JLT lz4_mz_ll_end
	CMPQ DI, $0xf0
	JB lz4_mz_ll_end

lz4_mz_ll_loop:
	// Extended literal length: add bytes while they equal 0xff.
	INCQ DX
	CMPQ DX, BX
	JAE lz4_mz_corrupt
	MOVBQZX (DX), DI
	ADDQ DI, R8
	CMPQ DI, $0xff
	JEQ lz4_mz_ll_loop

lz4_mz_ll_end:
	LEAQ (DX)(R8*1), DI
	ADDQ $0x04, R9
	CMPQ DI, BX
	JAE lz4_mz_corrupt
	INCQ DX
	INCQ DI
	TESTQ R8, R8
	JZ lz4_mz_lits_done
	TESTQ R10, R10
	JNZ lz4_mz_lits_done
	LEAQ (AX)(R8*1), R11
	CMPQ R11, CX
	JAE lz4_mz_dstfull

	// emitLiteral
	LEAL -1(R8), R11
	CMPL R11, $0x1d
	JB one_byte_lz4_mz
	SUBL $0x1d, R11
	CMPL R11, $0x00000100
	JB two_bytes_lz4_mz
	CMPL R11, $0x00010000
	JB three_bytes_lz4_mz
	MOVL R11, R12
	SHRL $0x10, R12
	MOVB $0xf8, (AX)
	MOVW R11, 1(AX)
	MOVB R12, 3(AX)
	ADDQ $0x04, AX
	ADDL $0x1d, R11
	JMP memmove_long_lz4_mz

three_bytes_lz4_mz:
	MOVB $0xf0, (AX)
	MOVW R11, 1(AX)
	ADDQ $0x03, AX
	ADDL $0x1d, R11
	JMP memmove_long_lz4_mz

two_bytes_lz4_mz:
	MOVB $0xe8, (AX)
	MOVB R11, 1(AX)
	ADDL $0x1d, R11
	ADDQ $0x02, AX
	CMPL R11, $0x40
	JB memmove_midlz4_mz
	JMP memmove_long_lz4_mz

one_byte_lz4_mz:
	SHLB $0x03, R11
	MOVB R11, (AX)
	ADDQ $0x01, AX
	LEAQ (AX)(R8*1), R11
	MOVL R8, R12

	// genMemMoveShort
	// margin: 0, min move: 1
	CMPQ R12, $0x03
	JB emit_lit_memmove_lz4_mz_memmove_move_1or2
	JE emit_lit_memmove_lz4_mz_memmove_move_3
	CMPQ R12, $0x08
	JBE emit_lit_memmove_lz4_mz_memmove_move_4through8
	CMPQ R12, $0x10
	JBE emit_lit_memmove_lz4_mz_memmove_move_8through16
	CMPQ R12, $0x20
	JBE emit_lit_memmove_lz4_mz_memmove_move_17through32
	JMP emit_lit_memmove_lz4_mz_memmove_move_33through64

emit_lit_memmove_lz4_mz_memmove_move_1or2:
	MOVB (DX), R13
	MOVB -1(DX)(R12*1), R14
	MOVB R13, (AX)
	MOVB R14, -1(AX)(R12*1)
	JMP memmove_end_copy_lz4_mz

emit_lit_memmove_lz4_mz_memmove_move_3:
	MOVW (DX), R13
	MOVB 2(DX), R14
	MOVW R13, (AX)
	MOVB R14, 2(AX)
	JMP memmove_end_copy_lz4_mz

emit_lit_memmove_lz4_mz_memmove_move_4through8:
	MOVL (DX), R13
	MOVL -4(DX)(R12*1), R14
	MOVL R13, (AX)
	MOVL R14, -4(AX)(R12*1)
	JMP memmove_end_copy_lz4_mz

emit_lit_memmove_lz4_mz_memmove_move_8through16:
	MOVQ (DX), R13
	MOVQ -8(DX)(R12*1), R14
	MOVQ R13, (AX)
	MOVQ R14, -8(AX)(R12*1)
	JMP memmove_end_copy_lz4_mz

emit_lit_memmove_lz4_mz_memmove_move_17through32:
	MOVOU (DX), X0
	MOVOU -16(DX)(R12*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R12*1)
	JMP memmove_end_copy_lz4_mz

emit_lit_memmove_lz4_mz_memmove_move_33through64:
	MOVOU (DX), X0
	MOVOU 16(DX), X1
	MOVOU -32(DX)(R12*1), X2
	MOVOU -16(DX)(R12*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R12*1)
	MOVOU X3, -16(AX)(R12*1)

memmove_end_copy_lz4_mz:
	MOVQ R11, AX
	JMP lz4_mz_lits_emit_done

memmove_midlz4_mz:
	LEAQ (AX)(R8*1), R11
	MOVL R8, R12

	// genMemMoveShort
	// margin: 0, min move: 30
	CMPQ R12, $0x20
	JBE emit_lit_memmove_mid_lz4_mz_memmove_move_17through32
	JMP emit_lit_memmove_mid_lz4_mz_memmove_move_33through64

emit_lit_memmove_mid_lz4_mz_memmove_move_17through32:
	MOVOU (DX), X0
	MOVOU -16(DX)(R12*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R12*1)
	JMP memmove_mid_end_copy_lz4_mz

emit_lit_memmove_mid_lz4_mz_memmove_move_33through64:
	MOVOU (DX), X0
	MOVOU 16(DX), X1
	MOVOU -32(DX)(R12*1), X2
	MOVOU -16(DX)(R12*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R12*1)
	MOVOU X3, -16(AX)(R12*1)

memmove_mid_end_copy_lz4_mz:
	MOVQ R11, AX
	JMP lz4_mz_lits_emit_done

memmove_long_lz4_mz:
	LEAQ (AX)(R8*1), R11
	MOVL R8, R12

	// genMemMoveLong
	MOVOU (DX), X0
	MOVOU 16(DX), X1
	MOVOU -32(DX)(R12*1), X2
	MOVOU -16(DX)(R12*1), X3
	MOVQ R12, R14
	SHRQ $0x05, R14
	MOVQ AX, R13
	ANDL $0x0000001f, R13
	MOVQ $0x00000040, R15
	SUBQ R13, R15
	DECQ R14
	JA emit_lit_memmove_long_lz4_mzlarge_forward_sse_loop_32
	LEAQ -32(DX)(R15*1), R13
	LEAQ -32(AX)(R15*1), BP

emit_lit_memmove_long_lz4_mzlarge_big_loop_back:
	MOVOU (R13), X4
	MOVOU 16(R13), X5
	MOVOA X4, (BP)
	MOVOA X5, 16(BP)
	ADDQ $0x20, BP
	ADDQ $0x20, R13
	ADDQ $0x20, R15
	DECQ R14
	JNA emit_lit_memmove_long_lz4_mzlarge_big_loop_back

emit_lit_memmove_long_lz4_mzlarge_forward_sse_loop_32:
	MOVOU -32(DX)(R15*1), X4
	MOVOU -16(DX)(R15*1), X5
	MOVOA X4, -32(AX)(R15*1)
	MOVOA X5, -16(AX)(R15*1)
	ADDQ $0x20, R15
	CMPQ R12, R15
	JAE emit_lit_memmove_long_lz4_mzlarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R12*1)
	MOVOU X3, -16(AX)(R12*1)
	MOVQ R11, AX

lz4_mz_lits_emit_done:
lz4_mz_lits_done:
	// Account for the literals, then swap pointers: DI = start of the
	// literals (needed if they were deferred), DX = first byte after them.
	ADDQ R8, SI
	MOVQ DI, R8
	MOVQ DX, DI
	MOVQ R8, DX
	CMPQ DX, BX
	JNE lz4_mz_match
	// Input ends after the literals: valid only if the token had no match
	// length (R9 == 4 after the +4 bias).
	CMPQ R9, $0x04
	JNE lz4_mz_corrupt
	TESTQ R10, R10
	JNZ lz4_mz_emit_final
	JMP lz4_mz_done

lz4_mz_match:
	// Read the 16-bit offset; reject 0 and offsets beyond the output so far.
	ADDQ $0x02, DX
	CMPQ DX, BX
	JAE lz4_mz_corrupt
	MOVWQZX -2(DX), R8
	TESTQ R8, R8
	JZ lz4_mz_corrupt
	CMPQ R8, SI
	JA lz4_mz_corrupt
	CMPQ R9, $0x13
	JNE lz4_mz_ml_done

lz4_mz_ml_loop:
	// Extended match length: add bytes while they equal 0xff.
	MOVBQZX (DX), R11
	INCQ DX
	ADDQ R11, R9
	CMPQ DX, BX
	JAE lz4_mz_corrupt
	CMPQ R11, $0xff
	JEQ lz4_mz_ml_loop

lz4_mz_ml_done:
	ADDQ R9, SI
	TESTQ R10, R10
	JNZ lz4_mz_dofuse
	// Same offset as the previous match -> emit a repeat instead of a copy.
	CMPQ (SP), R8
	JNE lz4_mz_docopy

	// emitRepeat
	LEAL -1(R9), DI
	CMPL R9, $0x1d
	JBE repeat_one_lz4_mz
	LEAL -30(R9), DI
	CMPL R9, $0x0000011e
	JB repeat_two_lz4_mz
	CMPL R9, $0x0001001e
	JB repeat_three_lz4_mz
	MOVB $0xfc, (AX)
	MOVL DI, 1(AX)
	ADDQ $0x04, AX
	JMP lz4_mz_loop

repeat_three_lz4_mz:
	MOVB $0xf4, (AX)
	MOVW DI, 1(AX)
	ADDQ $0x03, AX
	JMP lz4_mz_loop

repeat_two_lz4_mz:
	MOVB $0xec, (AX)
	MOVB DI, 1(AX)
	ADDQ $0x02, AX
	JMP lz4_mz_loop

repeat_one_lz4_mz:
	XORL DI, DI
	LEAL -4(DI)(R9*8), DI
	MOVB DI, (AX)
	ADDQ $0x01, AX
	JMP lz4_mz_loop

lz4_mz_dofuse:
	// Deferred literals (R10 of them, at DI) plus a match: record the offset,
	// then fuse literals and copy when offset >= 64, otherwise emit them
	// separately.
	MOVQ R8, (SP)
	CMPQ R8, $0x40
	JB lz4_mz_doemitcopy

	// emitCopy2WithLits
	XORQ R11, R11
	SUBL $0x40, R8
	LEAL -11(R9), R12
	LEAL -4(R9), R9
	MOVW R8, 1(AX)
	CMPL R9, $0x07
	CMOVLGE R12, R11
	MOVQ $0x00000007, R8
	CMOVLLT R9, R8
	LEAL -1(R10)(R8*4), R8
	MOVL $0x00000003, R9
	LEAL (R9)(R8*8), R8
	MOVB R8, (AX)
	ADDQ $0x03, AX
	// Always moves 4 bytes; only R10 of them are kept by the pointer advance.
	MOVL (DI), DI
	MOVL DI, (AX)
	ADDQ R10, AX
	TESTL R11, R11
	JZ lz4_mz_loop

	// emitRepeat
	LEAL -1(R11), DI
	CMPL R11, $0x1d
	JBE repeat_one_fused_emitrep_lz4_mz_
	LEAL -30(R11), DI
	CMPL R11, $0x0000011e
	JB repeat_two_fused_emitrep_lz4_mz_
	CMPL R11, $0x0001001e
	JB repeat_three_fused_emitrep_lz4_mz_
	MOVB $0xfc, (AX)
	MOVL DI, 1(AX)
	ADDQ $0x04, AX
	JMP lz4_mz_loop

repeat_three_fused_emitrep_lz4_mz_:
	MOVB $0xf4, (AX)
	MOVW DI, 1(AX)
	ADDQ $0x03, AX
	JMP lz4_mz_loop

repeat_two_fused_emitrep_lz4_mz_:
	MOVB $0xec, (AX)
	MOVB DI, 1(AX)
	ADDQ $0x02, AX
	JMP lz4_mz_loop

repeat_one_fused_emitrep_lz4_mz_:
	XORL DI, DI
	LEAL -4(DI)(R11*8), DI
	MOVB DI, (AX)
	ADDQ $0x01, AX
	JMP lz4_mz_loop

lz4_mz_doemitcopy:
	// emitLiteral
	LEAL -1(R10), R11
	CMPL R11, $0x1d
	JB one_byte_lz4_mz_emitcopy
	SUBL $0x1d, R11
	CMPL R11, $0x00000100
	JB two_bytes_lz4_mz_emitcopy
	CMPL R11, $0x00010000
	JB three_bytes_lz4_mz_emitcopy
	MOVL R11, R12
	SHRL $0x10, R12
	MOVB $0xf8, (AX)
	MOVW R11, 1(AX)
	MOVB R12, 3(AX)
	ADDQ $0x04, AX
	ADDL $0x1d, R11
	JMP memmove_long_lz4_mz_emitcopy

three_bytes_lz4_mz_emitcopy:
	MOVB $0xf0, (AX)
	MOVW R11, 1(AX)
	ADDQ $0x03, AX
	ADDL $0x1d, R11
	JMP memmove_long_lz4_mz_emitcopy

two_bytes_lz4_mz_emitcopy:
	MOVB $0xe8, (AX)
	MOVB R11, 1(AX)
	ADDL $0x1d, R11
	ADDQ $0x02, AX
	CMPL R11, $0x40
	JB memmove_midlz4_mz_emitcopy
	JMP memmove_long_lz4_mz_emitcopy

one_byte_lz4_mz_emitcopy:
	SHLB $0x03, R11
	MOVB R11, (AX)
	ADDQ $0x01, AX
	LEAQ (AX)(R10*1), R11

	// genMemMoveShort
	// margin: 0, min move: 1
	CMPQ R10, $0x03
	JB emit_lit_memmove_lz4_mz_emitcopy_memmove_move_1or2
	JE emit_lit_memmove_lz4_mz_emitcopy_memmove_move_3
	CMPQ R10, $0x08
	JBE emit_lit_memmove_lz4_mz_emitcopy_memmove_move_4through8
	CMPQ R10, $0x10
	JBE emit_lit_memmove_lz4_mz_emitcopy_memmove_move_8through16
	CMPQ R10, $0x20
	JBE emit_lit_memmove_lz4_mz_emitcopy_memmove_move_17through32
	JMP emit_lit_memmove_lz4_mz_emitcopy_memmove_move_33through64

emit_lit_memmove_lz4_mz_emitcopy_memmove_move_1or2:
	MOVB (DI), R12
	MOVB -1(DI)(R10*1), DI
	MOVB R12, (AX)
	MOVB DI, -1(AX)(R10*1)
	JMP memmove_end_copy_lz4_mz_emitcopy

emit_lit_memmove_lz4_mz_emitcopy_memmove_move_3:
	MOVW (DI), R12
	MOVB 2(DI), DI
	MOVW R12, (AX)
	MOVB DI, 2(AX)
	JMP memmove_end_copy_lz4_mz_emitcopy
emit_lit_memmove_lz4_mz_emitcopy_memmove_move_4through8: MOVL (DI), R12 MOVL -4(DI)(R10*1), DI MOVL R12, (AX) MOVL DI, -4(AX)(R10*1) JMP memmove_end_copy_lz4_mz_emitcopy emit_lit_memmove_lz4_mz_emitcopy_memmove_move_8through16: MOVQ (DI), R12 MOVQ -8(DI)(R10*1), DI MOVQ R12, (AX) MOVQ DI, -8(AX)(R10*1) JMP memmove_end_copy_lz4_mz_emitcopy emit_lit_memmove_lz4_mz_emitcopy_memmove_move_17through32: MOVOU (DI), X0 MOVOU -16(DI)(R10*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(R10*1) JMP memmove_end_copy_lz4_mz_emitcopy emit_lit_memmove_lz4_mz_emitcopy_memmove_move_33through64: MOVOU (DI), X0 MOVOU 16(DI), X1 MOVOU -32(DI)(R10*1), X2 MOVOU -16(DI)(R10*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R10*1) MOVOU X3, -16(AX)(R10*1) memmove_end_copy_lz4_mz_emitcopy: MOVQ R11, AX JMP lz4_mz__emit_done memmove_midlz4_mz_emitcopy: LEAQ (AX)(R10*1), R11 // genMemMoveShort // margin: 0, min move: 30 CMPQ R10, $0x20 JBE emit_lit_memmove_mid_lz4_mz_emitcopy_memmove_move_17through32 JMP emit_lit_memmove_mid_lz4_mz_emitcopy_memmove_move_33through64 emit_lit_memmove_mid_lz4_mz_emitcopy_memmove_move_17through32: MOVOU (DI), X0 MOVOU -16(DI)(R10*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(R10*1) JMP memmove_mid_end_copy_lz4_mz_emitcopy emit_lit_memmove_mid_lz4_mz_emitcopy_memmove_move_33through64: MOVOU (DI), X0 MOVOU 16(DI), X1 MOVOU -32(DI)(R10*1), X2 MOVOU -16(DI)(R10*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R10*1) MOVOU X3, -16(AX)(R10*1) memmove_mid_end_copy_lz4_mz_emitcopy: MOVQ R11, AX JMP lz4_mz__emit_done memmove_long_lz4_mz_emitcopy: LEAQ (AX)(R10*1), R11 // genMemMoveLong MOVOU (DI), X0 MOVOU 16(DI), X1 MOVOU -32(DI)(R10*1), X2 MOVOU -16(DI)(R10*1), X3 MOVQ R10, R13 SHRQ $0x05, R13 MOVQ AX, R12 ANDL $0x0000001f, R12 MOVQ $0x00000040, R14 SUBQ R12, R14 DECQ R13 JA emit_lit_memmove_long_lz4_mz_emitcopylarge_forward_sse_loop_32 LEAQ -32(DI)(R14*1), R12 LEAQ -32(AX)(R14*1), R15 emit_lit_memmove_long_lz4_mz_emitcopylarge_big_loop_back: MOVOU (R12), X4 MOVOU 
16(R12), X5 MOVOA X4, (R15) MOVOA X5, 16(R15) ADDQ $0x20, R15 ADDQ $0x20, R12 ADDQ $0x20, R14 DECQ R13 JNA emit_lit_memmove_long_lz4_mz_emitcopylarge_big_loop_back emit_lit_memmove_long_lz4_mz_emitcopylarge_forward_sse_loop_32: MOVOU -32(DI)(R14*1), X4 MOVOU -16(DI)(R14*1), X5 MOVOA X4, -32(AX)(R14*1) MOVOA X5, -16(AX)(R14*1) ADDQ $0x20, R14 CMPQ R10, R14 JAE emit_lit_memmove_long_lz4_mz_emitcopylarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R10*1) MOVOU X3, -16(AX)(R10*1) MOVQ R11, AX lz4_mz__emit_done: // emitCopy CMPL R8, $0x00000400 JA two_byte_lz4_mz__lz4_mz_short_ CMPL R9, $0x00000013 JAE emit_one_longer_lz4_mz__lz4_mz_short_ LEAL -1(R8), DI SHLL $0x06, DI LEAL -15(DI)(R9*4), DI MOVW DI, (AX) ADDQ $0x02, AX JMP lz4_mz_loop emit_one_longer_lz4_mz__lz4_mz_short_: CMPL R9, $0x00000112 JAE emit_copy1_repeat_lz4_mz__lz4_mz_short_ LEAL -1(R8), DI SHLL $0x06, DI LEAL 61(DI), DI MOVW DI, (AX) LEAL -18(R9), DI MOVB DI, 2(AX) ADDQ $0x03, AX JMP lz4_mz_loop emit_copy1_repeat_lz4_mz__lz4_mz_short_: LEAL -1(R8), DI SHLL $0x06, DI LEAL 57(DI), DI MOVW DI, (AX) ADDQ $0x02, AX SUBL $0x12, R9 // emitRepeat LEAL -1(R9), DI CMPL R9, $0x1d JBE repeat_one_emit_copy1_do_repeat_lz4_mz__lz4_mz_short_ LEAL -30(R9), DI CMPL R9, $0x0000011e JB repeat_two_emit_copy1_do_repeat_lz4_mz__lz4_mz_short_ CMPL R9, $0x0001001e JB repeat_three_emit_copy1_do_repeat_lz4_mz__lz4_mz_short_ MOVB $0xfc, (AX) MOVL DI, 1(AX) ADDQ $0x04, AX JMP lz4_mz_loop repeat_three_emit_copy1_do_repeat_lz4_mz__lz4_mz_short_: MOVB $0xf4, (AX) MOVW DI, 1(AX) ADDQ $0x03, AX JMP lz4_mz_loop repeat_two_emit_copy1_do_repeat_lz4_mz__lz4_mz_short_: MOVB $0xec, (AX) MOVB DI, 1(AX) ADDQ $0x02, AX JMP lz4_mz_loop repeat_one_emit_copy1_do_repeat_lz4_mz__lz4_mz_short_: XORL DI, DI LEAL -4(DI)(R9*8), DI MOVB DI, (AX) ADDQ $0x01, AX JMP lz4_mz_loop two_byte_lz4_mz__lz4_mz_short_: // emitCopy2 LEAL -64(R8), R8 LEAL -4(R9), R9 MOVW R8, 1(AX) CMPL R9, $0x3c JBE emit_copy2_0_lz4_mz__lz4_mz_short__emit2 LEAL 
-60(R9), DI CMPL R9, $0x0000013c JB emit_copy2_1_lz4_mz__lz4_mz_short__emit2 CMPL R9, $0x0001003c JB emit_copy2_2_lz4_mz__lz4_mz_short__emit2 MOVB $0xfe, (AX) MOVL DI, 3(AX) ADDQ $0x06, AX JMP lz4_mz_loop emit_copy2_2_lz4_mz__lz4_mz_short__emit2: MOVB $0xfa, (AX) MOVW DI, 3(AX) ADDQ $0x05, AX JMP lz4_mz_loop emit_copy2_1_lz4_mz__lz4_mz_short__emit2: MOVB $0xf6, (AX) MOVB DI, 3(AX) ADDQ $0x04, AX JMP lz4_mz_loop emit_copy2_0_lz4_mz__lz4_mz_short__emit2: MOVL $0x00000002, DI LEAL (DI)(R9*4), DI MOVB DI, (AX) ADDQ $0x03, AX JMP lz4_mz_loop lz4_mz_docopy: MOVQ R8, (SP) // emitCopy CMPL R8, $0x00000400 JA two_byte_lz4_mz__lz4_mz CMPL R9, $0x00000013 JAE emit_one_longer_lz4_mz__lz4_mz LEAL -1(R8), DI SHLL $0x06, DI LEAL -15(DI)(R9*4), DI MOVW DI, (AX) ADDQ $0x02, AX JMP lz4_mz_loop emit_one_longer_lz4_mz__lz4_mz: CMPL R9, $0x00000112 JAE emit_copy1_repeat_lz4_mz__lz4_mz LEAL -1(R8), DI SHLL $0x06, DI LEAL 61(DI), DI MOVW DI, (AX) LEAL -18(R9), DI MOVB DI, 2(AX) ADDQ $0x03, AX JMP lz4_mz_loop emit_copy1_repeat_lz4_mz__lz4_mz: LEAL -1(R8), DI SHLL $0x06, DI LEAL 57(DI), DI MOVW DI, (AX) ADDQ $0x02, AX SUBL $0x12, R9 // emitRepeat LEAL -1(R9), DI CMPL R9, $0x1d JBE repeat_one_emit_copy1_do_repeat_lz4_mz__lz4_mz LEAL -30(R9), DI CMPL R9, $0x0000011e JB repeat_two_emit_copy1_do_repeat_lz4_mz__lz4_mz CMPL R9, $0x0001001e JB repeat_three_emit_copy1_do_repeat_lz4_mz__lz4_mz MOVB $0xfc, (AX) MOVL DI, 1(AX) ADDQ $0x04, AX JMP lz4_mz_loop repeat_three_emit_copy1_do_repeat_lz4_mz__lz4_mz: MOVB $0xf4, (AX) MOVW DI, 1(AX) ADDQ $0x03, AX JMP lz4_mz_loop repeat_two_emit_copy1_do_repeat_lz4_mz__lz4_mz: MOVB $0xec, (AX) MOVB DI, 1(AX) ADDQ $0x02, AX JMP lz4_mz_loop repeat_one_emit_copy1_do_repeat_lz4_mz__lz4_mz: XORL DI, DI LEAL -4(DI)(R9*8), DI MOVB DI, (AX) ADDQ $0x01, AX JMP lz4_mz_loop two_byte_lz4_mz__lz4_mz: // emitCopy2 LEAL -64(R8), R8 LEAL -4(R9), R9 MOVW R8, 1(AX) CMPL R9, $0x3c JBE emit_copy2_0_lz4_mz__lz4_mz_emit2 LEAL -60(R9), DI CMPL R9, $0x0000013c JB 
emit_copy2_1_lz4_mz__lz4_mz_emit2 CMPL R9, $0x0001003c JB emit_copy2_2_lz4_mz__lz4_mz_emit2 MOVB $0xfe, (AX) MOVL DI, 3(AX) ADDQ $0x06, AX JMP lz4_mz_loop emit_copy2_2_lz4_mz__lz4_mz_emit2: MOVB $0xfa, (AX) MOVW DI, 3(AX) ADDQ $0x05, AX JMP lz4_mz_loop emit_copy2_1_lz4_mz__lz4_mz_emit2: MOVB $0xf6, (AX) MOVB DI, 3(AX) ADDQ $0x04, AX JMP lz4_mz_loop emit_copy2_0_lz4_mz__lz4_mz_emit2: MOVL $0x00000002, DI LEAL (DI)(R9*4), DI MOVB DI, (AX) ADDQ $0x03, AX JMP lz4_mz_loop lz4_mz_emit_final: // emitLiteral LEAL -1(R10), CX CMPL CX, $0x1d JB one_byte_lz4_mz_emit_final SUBL $0x1d, CX CMPL CX, $0x00000100 JB two_bytes_lz4_mz_emit_final CMPL CX, $0x00010000 JB three_bytes_lz4_mz_emit_final MOVL CX, DX SHRL $0x10, DX MOVB $0xf8, (AX) MOVW CX, 1(AX) MOVB DL, 3(AX) ADDQ $0x04, AX ADDL $0x1d, CX JMP memmove_long_lz4_mz_emit_final three_bytes_lz4_mz_emit_final: MOVB $0xf0, (AX) MOVW CX, 1(AX) ADDQ $0x03, AX ADDL $0x1d, CX JMP memmove_long_lz4_mz_emit_final two_bytes_lz4_mz_emit_final: MOVB $0xe8, (AX) MOVB CL, 1(AX) ADDL $0x1d, CX ADDQ $0x02, AX CMPL CX, $0x40 JB memmove_midlz4_mz_emit_final JMP memmove_long_lz4_mz_emit_final one_byte_lz4_mz_emit_final: SHLB $0x03, CL MOVB CL, (AX) ADDQ $0x01, AX LEAQ (AX)(R10*1), CX MOVL R10, DX // genMemMoveShort // margin: 0, min move: 1 CMPQ DX, $0x03 JB emit_lit_memmove_lz4_mz_emit_final_memmove_move_1or2 JE emit_lit_memmove_lz4_mz_emit_final_memmove_move_3 CMPQ DX, $0x08 JBE emit_lit_memmove_lz4_mz_emit_final_memmove_move_4through8 CMPQ DX, $0x10 JBE emit_lit_memmove_lz4_mz_emit_final_memmove_move_8through16 CMPQ DX, $0x20 JBE emit_lit_memmove_lz4_mz_emit_final_memmove_move_17through32 JMP emit_lit_memmove_lz4_mz_emit_final_memmove_move_33through64 emit_lit_memmove_lz4_mz_emit_final_memmove_move_1or2: MOVB (DI), BL MOVB -1(DI)(DX*1), DI MOVB BL, (AX) MOVB DI, -1(AX)(DX*1) JMP memmove_end_copy_lz4_mz_emit_final emit_lit_memmove_lz4_mz_emit_final_memmove_move_3: MOVW (DI), BX MOVB 2(DI), DI MOVW BX, (AX) MOVB DI, 2(AX) JMP 
memmove_end_copy_lz4_mz_emit_final emit_lit_memmove_lz4_mz_emit_final_memmove_move_4through8: MOVL (DI), BX MOVL -4(DI)(DX*1), DI MOVL BX, (AX) MOVL DI, -4(AX)(DX*1) JMP memmove_end_copy_lz4_mz_emit_final emit_lit_memmove_lz4_mz_emit_final_memmove_move_8through16: MOVQ (DI), BX MOVQ -8(DI)(DX*1), DI MOVQ BX, (AX) MOVQ DI, -8(AX)(DX*1) JMP memmove_end_copy_lz4_mz_emit_final emit_lit_memmove_lz4_mz_emit_final_memmove_move_17through32: MOVOU (DI), X0 MOVOU -16(DI)(DX*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(DX*1) JMP memmove_end_copy_lz4_mz_emit_final emit_lit_memmove_lz4_mz_emit_final_memmove_move_33through64: MOVOU (DI), X0 MOVOU 16(DI), X1 MOVOU -32(DI)(DX*1), X2 MOVOU -16(DI)(DX*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(DX*1) MOVOU X3, -16(AX)(DX*1) memmove_end_copy_lz4_mz_emit_final: MOVQ CX, AX JMP lz4_mz_done memmove_midlz4_mz_emit_final: LEAQ (AX)(R10*1), CX MOVL R10, DX // genMemMoveShort // margin: 0, min move: 30 CMPQ DX, $0x20 JBE emit_lit_memmove_mid_lz4_mz_emit_final_memmove_move_17through32 JMP emit_lit_memmove_mid_lz4_mz_emit_final_memmove_move_33through64 emit_lit_memmove_mid_lz4_mz_emit_final_memmove_move_17through32: MOVOU (DI), X0 MOVOU -16(DI)(DX*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(DX*1) JMP memmove_mid_end_copy_lz4_mz_emit_final emit_lit_memmove_mid_lz4_mz_emit_final_memmove_move_33through64: MOVOU (DI), X0 MOVOU 16(DI), X1 MOVOU -32(DI)(DX*1), X2 MOVOU -16(DI)(DX*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(DX*1) MOVOU X3, -16(AX)(DX*1) memmove_mid_end_copy_lz4_mz_emit_final: MOVQ CX, AX JMP lz4_mz_done memmove_long_lz4_mz_emit_final: LEAQ (AX)(R10*1), CX MOVL R10, DX // genMemMoveLong MOVOU (DI), X0 MOVOU 16(DI), X1 MOVOU -32(DI)(DX*1), X2 MOVOU -16(DI)(DX*1), X3 MOVQ DX, R8 SHRQ $0x05, R8 MOVQ AX, BX ANDL $0x0000001f, BX MOVQ $0x00000040, R9 SUBQ BX, R9 DECQ R8 JA emit_lit_memmove_long_lz4_mz_emit_finallarge_forward_sse_loop_32 LEAQ -32(DI)(R9*1), BX LEAQ -32(AX)(R9*1), R10 
emit_lit_memmove_long_lz4_mz_emit_finallarge_big_loop_back:
	MOVOU (BX), X4
	MOVOU 16(BX), X5
	MOVOA X4, (R10)
	MOVOA X5, 16(R10)
	ADDQ $0x20, R10
	ADDQ $0x20, BX
	ADDQ $0x20, R9
	DECQ R8
	JNA emit_lit_memmove_long_lz4_mz_emit_finallarge_big_loop_back

emit_lit_memmove_long_lz4_mz_emit_finallarge_forward_sse_loop_32:
	MOVOU -32(DI)(R9*1), X4
	MOVOU -16(DI)(R9*1), X5
	MOVOA X4, -32(AX)(R9*1)
	MOVOA X5, -16(AX)(R9*1)
	ADDQ $0x20, R9
	CMPQ DX, R9
	JAE emit_lit_memmove_long_lz4_mz_emit_finallarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(DX*1)
	MOVOU X3, -16(AX)(DX*1)
	MOVQ CX, AX

lz4_mz_done:
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ SI, uncompressed+48(FP)
	MOVQ AX, dstUsed+56(FP)
	RET

lz4_mz_corrupt:
	XORQ AX, AX
	LEAQ -1(AX), SI
	MOVQ SI, uncompressed+48(FP)
	RET

lz4_mz_dstfull:
	XORQ AX, AX
	LEAQ -2(AX), SI
	MOVQ SI, uncompressed+48(FP)
	RET

// func decodeBlockAsm(dst []byte, src []byte) int
// Requires: CMOV, SSE2
//
// decodeBlockAsm decodes the tag stream in src into dst.
//
// Result: 0 when decoding stops with the write pointer exactly at the end of
// dst and the read pointer exactly at the end of src. Every other outcome,
// including each failed bounds or offset check, returns 1 through the
// "corrupt" label.
//
// The routine runs in two phases:
//   - Fast loop: entered for a tag only while the read pointer is below
//     end(src)-20 and the write pointer is below end(dst)-20. Short moves in
//     this phase always transfer at least 16 bytes and tag fields are loaded
//     with 4-byte reads, relying on that slack.
//   - Remainder loop: every header read is bounds-checked against end(src)
//     and moves use exact-size variants (1..64 bytes).
//
// Tag layout as decoded here (low two bits of the first byte select the kind):
//   0: literals, or a copy that reuses the offset in R9 when bit 2 is set
//   1: copy with the offset taken from bits 6..15 of the first two bytes
//   2: copy with a 16-bit offset (+64) in bytes 1..2
//   3: copy preceded by fused literals; 16-bit (+64) or 21-bit (+65536) offset
//
// Register roles in the fast loop:
//   AX  end of dst                 CX  end of src
//   BX  end of dst - 20            DX  end of src - 20
//   SI  dst write pointer          R8  src read pointer
//   DI  bytes written so far (SI - dst base), used to validate offsets
//   R9  current copy offset (initialised to 1)
//   R10 tag & 3, fused literal count, or the prefetched next tag byte
//   R11 tag >> 2, later the copy source pointer
//   R12 length of the current literal run / copy
//   R13-R15, BP, X0-X5  scratch for the move sequences
// In the remainder loop DX carries tag & 3 and then the length, BX carries
// tag >> 2 and then the copy source pointer; AX, CX, SI, DI, R8, R9 keep
// their roles.
TEXT ·decodeBlockAsm(SB), $8-56
	MOVQ dst_base+0(FP), AX
	MOVQ dst_len+8(FP), CX
	MOVQ src_base+24(FP), DX
	MOVQ src_len+32(FP), BX
	MOVQ AX, SI
	XORQ DI, DI
	MOVQ DX, R8
	MOVQ $0x00000001, R9

	// AX = end of dst, CX = end of src; DX/BX are the same limits minus the
	// 20-byte fast-loop margin.
	LEAQ (AX)(CX*1), AX
	LEAQ (DX)(BX*1), CX
	LEAQ -20(CX), DX
	LEAQ -20(AX), BX
	CMPQ R8, DX
	JAE decodeBlockAsm_fast_end_copy

	// Load the first tag: R10 = tag byte, R11 = tag >> 2.
	MOVBQZX (R8), R10
	MOVQ R10, R11
	SHRQ $0x02, R11

	// Fast loop entry. The tag is already in R10/R11 ("nofetch").
decodeBlockAsm_fast_loop_nofetch:
	CMPQ SI, BX
	JAE decodeBlockAsm_fast_end_copy
	ANDQ $0x03, R10
	JNZ decodeBlockAsm_fast_copy

	// Literal tag. R12 = tag >> 3 (0..31):
	//   below 29  -> length-1 stored directly
	//   29 / 30   -> 1 / 2 extra length bytes follow
	//   otherwise -> 3 extra length bytes follow
decodeBlockAsm_fast_lits:
	MOVL R11, R12
	SHRL $0x01, R12
	CMPL R12, $0x1d
	JB decodeBlockAsm_fast_lit_0
	JEQ decodeBlockAsm_fast_lit_1
	CMPL R12, $0x1e
	JEQ decodeBlockAsm_fast_lit_2
	JMP decodeBlockAsm_fast_lit_3

	// Short form: length = R12 + 1 (1..29), one header byte consumed.
decodeBlockAsm_fast_lit_0:
	INCQ R8
	INCL R12
	LEAQ (SI)(R12*1), R10
	CMPQ R10, AX
	JA corrupt

	// Bit 0 of R11 (bit 2 of the tag) set: not literals, but a copy of
	// R12 bytes using the offset already held in R9.
	BTL $0x00, R11
	JC decodeBlockAsm_fast_copy_exec_short
	LEAQ (R8)(R12*1), R10
	CMPQ R10, CX
	JA corrupt

	// genMemMoveShort
	// margin: 19, min move: 1
	CMPQ R12, $0x10
	JBE decodeBlockAsm_fast_lit_0_copy_memmove_move_8through16
	CMPQ R12, $0x20
	JBE decodeBlockAsm_fast_lit_0_copy_memmove_move_17through32
	JMP decodeBlockAsm_fast_lit_0_copy_memmove_move_33through64

	// Lengths up to 16: one full 16-byte move; bytes past the length are
	// overwritten by later output.
decodeBlockAsm_fast_lit_0_copy_memmove_move_8through16:
	MOVOU (R8), X0
	MOVOU X0, (SI)
	JMP decodeBlockAsm_fast_litcopy_done

	// 17..32: first 16 and last 16 bytes (the two moves may overlap).
decodeBlockAsm_fast_lit_0_copy_memmove_move_17through32:
	MOVOU (R8), X0
	MOVOU -16(R8)(R12*1), X1
	MOVOU X0, (SI)
	MOVOU X1, -16(SI)(R12*1)
	JMP decodeBlockAsm_fast_litcopy_done

	// 33..64: first 32 and last 32 bytes.
decodeBlockAsm_fast_lit_0_copy_memmove_move_33through64:
	MOVOU (R8), X0
	MOVOU 16(R8), X1
	MOVOU -32(R8)(R12*1), X2
	MOVOU -16(R8)(R12*1), X3
	MOVOU X0, (SI)
	MOVOU X1, 16(SI)
	MOVOU X2, -32(SI)(R12*1)
	MOVOU X3, -16(SI)(R12*1)
	JMP decodeBlockAsm_fast_litcopy_done

	// One extra length byte at offset 1.
decodeBlockAsm_fast_lit_1:
	MOVBQZX 1(R8), R12
	ADDQ $0x02, R8
	JMP decodeBlockAsm_fast_litcopy_long

	// Two extra length bytes at offset 1.
decodeBlockAsm_fast_lit_2:
	MOVWQZX 1(R8), R12
	ADDQ $0x03, R8
	JMP decodeBlockAsm_fast_litcopy_long

	// Three extra length bytes: load 4 bytes including the tag and shift
	// the tag byte out.
decodeBlockAsm_fast_lit_3:
	MOVL (R8), R12
	ADDQ $0x04, R8
	SHRL $0x08, R12

	// Extended form: length = extra value + 30.
decodeBlockAsm_fast_litcopy_long:
	LEAQ 30(R12), R12
	LEAQ (SI)(R12*1), R10
	CMPQ R10, AX
	JA corrupt

	// As above: bit 2 of the tag turns this into a copy with the offset in R9.
	BTL $0x00, R11
	JC decodeBlockAsm_fast_copy_exec
	LEAQ (R8)(R12*1), R10
	CMPQ R10, CX
	JA corrupt
	CMPL R12, $0x40
	JBE decodeBlockAsm_fast_litcopy_short_reduced

	// genMemMoveLong
	// X0-X3 keep the first and last 32 source bytes; they are stored last
	// with unaligned moves. The middle is moved 32 bytes at a time using
	// aligned stores (MOVOA): R13 = 64 - (dst & 31), so dst+R13-32 is
	// 32-byte aligned.
	MOVOU (R8), X0
	MOVOU 16(R8), X1
	MOVOU -32(R8)(R12*1), X2
	MOVOU -16(R8)(R12*1), X3
	MOVQ R12, R11
	SHRQ $0x05, R11
	MOVQ SI, R10
	ANDL $0x0000001f, R10
	MOVQ $0x00000040, R13
	SUBQ R10, R13
	DECQ R11
	JA decodeBlockAsm_fast_litcopy_longlarge_forward_sse_loop_32
	LEAQ -32(R8)(R13*1), R10
	LEAQ -32(SI)(R13*1), R14

decodeBlockAsm_fast_litcopy_longlarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R14)
	MOVOA X5, 16(R14)
	ADDQ $0x20, R14
	ADDQ $0x20, R10
	ADDQ $0x20, R13
	DECQ R11
	JNA decodeBlockAsm_fast_litcopy_longlarge_big_loop_back

decodeBlockAsm_fast_litcopy_longlarge_forward_sse_loop_32:
	MOVOU -32(R8)(R13*1), X4
	MOVOU -16(R8)(R13*1), X5
	MOVOA X4, -32(SI)(R13*1)
	MOVOA X5, -16(SI)(R13*1)
	ADDQ $0x20, R13
	CMPQ R12, R13
	JAE decodeBlockAsm_fast_litcopy_longlarge_forward_sse_loop_32
	MOVOU X0, (SI)
	MOVOU X1, 16(SI)
	MOVOU X2, -32(SI)(R12*1)
	MOVOU X3, -16(SI)(R12*1)
	JMP decodeBlockAsm_fast_litcopy_done

	// Extended literals of 30..64 bytes.
decodeBlockAsm_fast_litcopy_short_reduced:
	// genMemMoveShort
	// margin: 16, min move: 30
	CMPQ R12, $0x20
	JBE decodeBlockAsm_fast_lit_longer_copy_memmove_move_17through32
	JMP decodeBlockAsm_fast_lit_longer_copy_memmove_move_33through64

decodeBlockAsm_fast_lit_longer_copy_memmove_move_17through32:
	MOVOU (R8), X0
	MOVOU -16(R8)(R12*1), X1
	MOVOU X0, (SI)
	MOVOU X1, -16(SI)(R12*1)
	JMP decodeBlockAsm_fast_litcopy_done

decodeBlockAsm_fast_lit_longer_copy_memmove_move_33through64:
	MOVOU (R8), X0
	MOVOU 16(R8), X1
	MOVOU -32(R8)(R12*1), X2
	MOVOU -16(R8)(R12*1), X3
	MOVOU X0, (SI)
	MOVOU X1, 16(SI)
	MOVOU X2, -32(SI)(R12*1)
	MOVOU X3, -16(SI)(R12*1)

	// Literals done: advance src, dst and the written-byte count, then load
	// the next tag if both pointers are still inside the fast margins.
decodeBlockAsm_fast_litcopy_done:
	ADDQ R12, R8
	ADDQ R12, SI
	ADDQ R12, DI
	CMPQ R8, DX
	JAE decodeBlockAsm_fast_end_done
	MOVBQZX (R8), R10
	MOVQ R10, R11
	SHRQ $0x02, R11
	CMPQ SI, BX
	JAE decodeBlockAsm_fast_end_done
	ANDQ $0x03, R10
	JZ decodeBlockAsm_fast_lits

	// Copy tag. R10 = tag & 3 (1, 2 or 3); R13 = first four tag bytes.
decodeBlockAsm_fast_copy:
	MOVL (R8), R13
	CMPL R10, $0x02
	JB decodeBlockAsm_fast_copy_1
	JEQ decodeBlockAsm_fast_copy_2
	JMP decodeBlockAsm_fast_copy_3

	// Kind 1: offset = (16-bit word >> 6) + 1, length = ((tag >> 2) & 15) + 4.
	// When that length is 19 (field == 15) the third byte replaces it with
	// byte + 18 and one more source byte is consumed; both updates are done
	// with conditional moves on the same compare.
decodeBlockAsm_fast_copy_1:
	MOVWQZX R13, R9
	ADDQ $0x02, R8
	MOVQ R11, R12
	ANDL $0x0f, R12
	SHRL $0x06, R9
	INCL R9
	SHRL $0x10, R13
	LEAQ 1(R8), R10
	MOVBLZX R13, R11
	ADDL $0x04, R12
	LEAL 18(R11), R11
	CMPL R12, $0x13
	CMOVLEQ R11, R12
	CMOVQEQ R10, R8
	JMP decodeBlockAsm_fast_copy_exec

	// Kind 2: 16-bit offset in bytes 1..2, biased by 64. R11 = tag >> 2 is
	// the length field: below 61 -> length = field + 4; 61 / 62 / 63 ->
	// 1 / 2 / 3 extra length bytes, length = extra + 64.
decodeBlockAsm_fast_copy_2:
	MOVQ R11, R12
	CMPL R11, $0x3d
	JB decodeBlockAsm_fast_copy_2_0_extra
	JEQ decodeBlockAsm_fast_copy_2_1_extra
	CMPL R12, $0x3f
	JB decodeBlockAsm_fast_copy_2_2_extra

	// Three extra length bytes (bytes 3..5): 4-byte load from byte 2, then
	// drop the low byte.
	MOVWQZX 1(R8), R9
	MOVL 2(R8), R12
	ADDQ $0x06, R8
	SHRL $0x08, R12
	LEAL 64(R12), R12
	ADDQ $0x40, R9
	JMP decodeBlockAsm_fast_copy_exec_long_long

decodeBlockAsm_fast_copy_2_2_extra:
	MOVWQZX 1(R8), R9
	MOVWLZX 3(R8), R12
	ADDQ $0x05, R8
	LEAL 64(R12), R12
	ADDQ $0x40, R9
	JMP decodeBlockAsm_fast_copy_exec_long_long

	// One extra length byte: taken from the top byte of the 4 bytes in R13.
decodeBlockAsm_fast_copy_2_1_extra:
	MOVL R13, R12
	SHRL $0x08, R13
	SHRL $0x18, R12
	MOVWQZX R13, R9
	ADDQ $0x04, R8
	LEAL 64(R12), R12
	ADDQ $0x40, R9
	JMP decodeBlockAsm_fast_copy_exec_long_long

decodeBlockAsm_fast_copy_2_0_extra:
	SHRL $0x08, R13
	MOVWQZX R13, R9
	LEAQ 3(R8), R8
	LEAL 4(R12), R12
	ADDQ $0x40, R9
	JMP decodeBlockAsm_fast_copy_short_no_ol

	// Kind 3: copy with fused literals. R10 = (tag >> 3) & 3 is the literal
	// count field. Bit 2 of the tag selects the 4-byte header (copy3_read);
	// otherwise this is the 3-byte form handled inline below.
decodeBlockAsm_fast_copy_3:
	MOVL R13, R9
	ADDQ $0x04, R8
	MOVQ R11, R10
	SHRQ $0x01, R10
	ANDQ $0x03, R10
	BTL $0x00, R11
	JC decodeBlockAsm_fast_copy3_read

	// 3-byte form: length = ((tag >> 5) & 7) + 4, offset = 16 bits from
	// bytes 1..2 plus 64, followed by R10+1 literal bytes. Four bytes are
	// always stored; the pointers advance only by the literal count.
	SHRL $0x03, R11
	ANDL $0x07, R11
	LEAL 4(R11), R12
	SHRL $0x08, R13
	MOVWQZX R13, R9
	DECQ R8
	INCQ R10
	MOVL (R8), R11
	MOVL R11, (SI)
	ADDQ $0x40, R9
	ADDQ R10, R8
	ADDQ R10, SI
	ADDQ R10, DI
	JMP decodeBlockAsm_fast_copy_short_no_ol

	// 4-byte form: length field = (tag32 >> 5) & 63,
	// offset = (tag32 >> 11) + 65536. Length field below 61 -> field + 4;
	// 61 / 62 / other -> 1 / 2 / 3 extra length bytes, length = extra + 64.
decodeBlockAsm_fast_copy3_read:
	MOVL R9, R12
	SHRL $0x05, R12
	ANDL $0x3f, R12
	SHRL $0x0b, R9
	ADDL $0x00010000, R9
	CMPL R12, $0x3d
	JB decodeBlockAsm_fast_copy_3_0_extra
	JEQ decodeBlockAsm_fast_copy_3_1_extra
	CMPL R12, $0x3e
	JEQ decodeBlockAsm_fast_copy_3_2_extra
	MOVL -1(R8), R12
	ADDQ $0x03, R8
	SHRL $0x08, R12
	LEAL 64(R12), R12
	JMP decodeBlockAsm_fast_copy_fused_long

decodeBlockAsm_fast_copy_3_2_extra:
	MOVWLZX (R8), R12
	ADDQ $0x02, R8
	LEAL 64(R12), R12
	JMP decodeBlockAsm_fast_copy_fused_long

decodeBlockAsm_fast_copy_3_1_extra:
	MOVBLZX (R8), R12
	ADDQ $0x01, R8
	LEAL 64(R12), R12
	JMP decodeBlockAsm_fast_copy_fused_long

	// No extra length bytes. Emit the R10 (0..3) fused literals with one
	// 4-byte store, advancing only by R10.
decodeBlockAsm_fast_copy_3_0_extra:
	LEAL 4(R12), R12
	MOVL (R8), R11
	MOVL R11, (SI)
	ADDQ R10, R8
	ADDQ R10, SI
	ADDQ R10, DI
	JMP decodeBlockAsm_fast_copy_short_no_ol

	// Same fused-literal emission for the long (length >= 64) variants.
decodeBlockAsm_fast_copy_fused_long:
	MOVL (R8), R11
	MOVL R11, (SI)
	ADDQ R10, R8
	ADDQ R10, SI
	ADDQ R10, DI
	JMP decodeBlockAsm_fast_copy_exec_long_long

	// Execute a copy of R12 (<= 29) bytes at offset R9.
	// Rejects offsets larger than the bytes written so far and copies that
	// would run past the end of dst.
decodeBlockAsm_fast_copy_exec_short:
	CMPL R9, DI
	JA corrupt
	LEAQ (SI)(R12*1), R10
	CMPQ R10, AX
	JA corrupt

	// Prefetch next tag
	MOVBQZX (R8), R10
	MOVQ SI, R11
	SUBQ R9, R11
	CMPL R9, R12
	JB decodeBlockAsm_fast_copy_overlap
	JMP decodeBlockAsm_fast_copy_short

	// Execute a copy whose length and offset are both at least 64.
	// R11 = source pointer (dst - offset).
decodeBlockAsm_fast_copy_exec_long_long:
	MOVQ SI, R11
	SUBQ R9, R11
	CMPL R9, DI
	JA corrupt
	LEAQ (SI)(R12*1), R10
	CMPQ R10, AX
	JA corrupt

	// Prefetch next tag
	MOVBQZX (R8), R10

	// genMemMoveLong
	// Forward copy, 32 bytes per iteration (R13 = length / 32 iterations),
	// R15 = bytes left after the loop. A final 32-byte move ending exactly
	// at the end of the range covers any remainder.
	MOVQ R12, R13
	SHRQ $0x05, R13
	MOVQ SI, R14
	MOVQ R12, R15

decodeBlockAsm_fast_copy_long_longlarge_big_loop_back:
	MOVOU (R11), X0
	MOVOU 16(R11), X1
	MOVOU X0, (R14)
	MOVOU X1, 16(R14)
	ADDQ $0x20, R14
	ADDQ $0x20, R11
	SUBQ $0x20, R15
	DECQ R13
	JNZ decodeBlockAsm_fast_copy_long_longlarge_big_loop_back
	TESTQ R15, R15
	JZ decodeBlockAsm_fast_copy_done
	MOVOU -32(R11)(R15*1), X0
	MOVOU -16(R11)(R15*1), X1
	MOVOU X0, -32(R14)(R15*1)
	MOVOU X1, -16(R14)(R15*1)
	JMP decodeBlockAsm_fast_copy_done

	// Execute a copy of 4..64 bytes whose offset has had 64 or 65536 added,
	// so source and destination ranges cannot overlap.
decodeBlockAsm_fast_copy_short_no_ol:
	MOVQ SI, R11
	SUBQ R9, R11
	CMPL R9, DI
	JA corrupt
	LEAQ (SI)(R12*1), R10
	CMPQ R10, AX
	JA corrupt

	// Prefetch next tag
	MOVBQZX (R8), R10

	// genMemMoveShort
	// margin: 16, min move: 4
	CMPQ R12, $0x10
	JBE decodeBlockAsm_fast_copy_short_no_ol_memmove_move_8through16
	CMPQ R12, $0x20
	JBE decodeBlockAsm_fast_copy_short_no_ol_memmove_move_17through32
	JMP decodeBlockAsm_fast_copy_short_no_ol_memmove_move_33through64

decodeBlockAsm_fast_copy_short_no_ol_memmove_move_8through16:
	MOVOU (R11), X0
	MOVOU X0, (SI)
	JMP decodeBlockAsm_fast_copy_done

decodeBlockAsm_fast_copy_short_no_ol_memmove_move_17through32:
	MOVOU (R11), X0
	MOVOU -16(R11)(R12*1), X1
	MOVOU X0, (SI)
	MOVOU X1, -16(SI)(R12*1)
	JMP decodeBlockAsm_fast_copy_done

decodeBlockAsm_fast_copy_short_no_ol_memmove_move_33through64:
	MOVOU (R11), X0
	MOVOU 16(R11), X1
	MOVOU -32(R11)(R12*1), X2
	MOVOU -16(R11)(R12*1), X3
	MOVOU X0, (SI)
	MOVOU X1, 16(SI)
	MOVOU X2, -32(SI)(R12*1)
	MOVOU X3, -16(SI)(R12*1)
	JMP decodeBlockAsm_fast_copy_done

	// General copy of R12 bytes at offset R9: validate, then dispatch on
	// overlap (offset < length) and on size.
decodeBlockAsm_fast_copy_exec:
	CMPL R9, DI
	JA corrupt
	LEAQ (SI)(R12*1), R10
	CMPQ R10, AX
	JA corrupt
	MOVQ SI, R11
	SUBQ R9, R11

	// Prefetch next tag
	MOVBQZX (R8), R10
	CMPL R9, R12
	JB decodeBlockAsm_fast_copy_overlap
	CMPL R12, $0x40
	JA decodeBlockAsm_fast_copy_long

decodeBlockAsm_fast_copy_short:
	// genMemMoveShort
	// margin: 16, min move: 1
	CMPQ R12, $0x10
	JBE decodeBlockAsm_fast_copy_short_memmove_move_8through16
	CMPQ R12, $0x20
	JBE decodeBlockAsm_fast_copy_short_memmove_move_17through32
	JMP decodeBlockAsm_fast_copy_short_memmove_move_33through64

decodeBlockAsm_fast_copy_short_memmove_move_8through16:
	MOVOU (R11), X0
	MOVOU X0, (SI)
	JMP decodeBlockAsm_fast_copy_done

decodeBlockAsm_fast_copy_short_memmove_move_17through32:
	MOVOU (R11), X0
	MOVOU -16(R11)(R12*1), X1
	MOVOU X0, (SI)
	MOVOU X1, -16(SI)(R12*1)
	JMP decodeBlockAsm_fast_copy_done

decodeBlockAsm_fast_copy_short_memmove_move_33through64:
	MOVOU (R11), X0
	MOVOU 16(R11), X1
	MOVOU -32(R11)(R12*1), X2
	MOVOU -16(R11)(R12*1), X3
	MOVOU X0, (SI)
	MOVOU X1, 16(SI)
	MOVOU X2, -32(SI)(R12*1)
	MOVOU X3, -16(SI)(R12*1)
	JMP decodeBlockAsm_fast_copy_done

	// Non-overlapping copy longer than 64 bytes; same scheme as the long
	// literal move above, with R15 as the running offset and BP/R13 as the
	// aligned destination/source cursors.
decodeBlockAsm_fast_copy_long:
	// genMemMoveLong
	MOVOU (R11), X0
	MOVOU 16(R11), X1
	MOVOU -32(R11)(R12*1), X2
	MOVOU -16(R11)(R12*1), X3
	MOVQ R12, R14
	SHRQ $0x05, R14
	MOVQ SI, R13
	ANDL $0x0000001f, R13
	MOVQ $0x00000040, R15
	SUBQ R13, R15
	DECQ R14
	JA decodeBlockAsm_fast_copy_longlarge_forward_sse_loop_32
	LEAQ -32(R11)(R15*1), R13
	LEAQ -32(SI)(R15*1), BP

decodeBlockAsm_fast_copy_longlarge_big_loop_back:
	MOVOU (R13), X4
	MOVOU 16(R13), X5
	MOVOA X4, (BP)
	MOVOA X5, 16(BP)
	ADDQ $0x20, BP
	ADDQ $0x20, R13
	ADDQ $0x20, R15
	DECQ R14
	JNA decodeBlockAsm_fast_copy_longlarge_big_loop_back

decodeBlockAsm_fast_copy_longlarge_forward_sse_loop_32:
	MOVOU -32(R11)(R15*1), X4
	MOVOU -16(R11)(R15*1), X5
	MOVOA X4, -32(SI)(R15*1)
	MOVOA X5, -16(SI)(R15*1)
	ADDQ $0x20, R15
	CMPQ R12, R15
	JAE decodeBlockAsm_fast_copy_longlarge_forward_sse_loop_32
	MOVOU X0, (SI)
	MOVOU X1, 16(SI)
	MOVOU X2, -32(SI)(R12*1)
	MOVOU X3, -16(SI)(R12*1)

	// Copy finished: advance dst and the written-byte count, turn the
	// prefetched tag in R10 into R11 = tag >> 2 and continue while src is
	// inside the fast margin.
decodeBlockAsm_fast_copy_done:
	ADDQ R12, SI
	ADDQ R12, DI
	MOVQ R10, R11
	SHRQ $0x02, R11
	CMPQ R8, DX
	JB decodeBlockAsm_fast_loop_nofetch
	JMP decodeBlockAsm_fast_end_copy

	// Overlapping copy (offset < length): specialised by offset 1, 2, 3 and
	// 4-or-more so the repeating pattern is produced front to back.
decodeBlockAsm_fast_copy_overlap:
	CMPL R9, $0x03
	JA decodeBlockAsm_fast_copy_overlap_4
	JE decodeBlockAsm_fast_copy_overlap_3
	CMPL R9, $0x02
	JE decodeBlockAsm_fast_copy_overlap_2

	// Offset 1: repeat the previous byte R12 times.
	MOVB (R11), R11
	ADDQ R12, DI

decodeBlockAsm_fast_loop_overlap_1:
	MOVB R11, (SI)
	INCQ SI
	DECQ R12
	JNZ decodeBlockAsm_fast_loop_overlap_1
	MOVQ R10, R11
	SHRQ $0x02, R11
	CMPQ R8, DX
	JB decodeBlockAsm_fast_loop_nofetch
	JMP decodeBlockAsm_fast_end_copy

	// Offset 2: repeat the previous two bytes. An odd length first emits one
	// byte and reloads the pattern shifted by one so the loop can store words.
decodeBlockAsm_fast_copy_overlap_2:
	MOVW (R11), R13
	ADDQ R12, DI
	BTL $0x00, R12
	JNC decodeBlockAsm_fast_loop_overlap_2
	MOVB R13, (SI)
	MOVW 1(R11), R13
	INCQ SI
	DECQ R12

decodeBlockAsm_fast_loop_overlap_2:
	MOVW R13, (SI)
	ADDQ $0x02, SI
	SUBQ $0x02, R12
	JNZ decodeBlockAsm_fast_loop_overlap_2
	MOVQ R10, R11
	SHRQ $0x02, R11
	CMPQ R8, DX
	JB decodeBlockAsm_fast_loop_nofetch
	JMP decodeBlockAsm_fast_end_copy

	// Offset 3: store a 4-byte load of the pattern every 3 bytes; the tail
	// (R12 is now zero or negative) is finished with a word and a byte move
	// positioned relative to the end.
decodeBlockAsm_fast_copy_overlap_3:
	MOVL (R11), R13
	ADDQ R12, DI
	SUBQ $0x03, R12

decodeBlockAsm_fast_loop_overlap_3:
	MOVL R13, (SI)
	ADDQ $0x03, SI
	SUBQ $0x03, R12
	JA decodeBlockAsm_fast_loop_overlap_3
	MOVW 3(R11)(R12*1), R13
	MOVW R13, (SI)(R12*1)
	MOVB 5(R11)(R12*1), R13
	MOVB R13, 2(SI)(R12*1)
	LEAQ 3(SI)(R12*1), SI
	MOVQ R10, R11
	SHRQ $0x02, R11
	CMPQ R8, DX
	JB decodeBlockAsm_fast_loop_nofetch
	JMP decodeBlockAsm_fast_end_copy

	// Offset >= 4: copy 4 bytes at a time, then one final 4-byte move
	// positioned to end exactly at the end of the copy.
decodeBlockAsm_fast_copy_overlap_4:
	ADDQ R12, DI
	SUBQ $0x04, R12

decodeBlockAsm_fast_loop_overlap_4:
	MOVL (R11), R13
	ADDQ $0x04, R11
	MOVL R13, (SI)
	ADDQ $0x04, SI
	SUBQ $0x04, R12
	JA decodeBlockAsm_fast_loop_overlap_4
	MOVL (R11)(R12*1), R13
	MOVL R13, (SI)(R12*1)
	LEAQ 4(SI)(R12*1), SI
	MOVQ R10, R11
	SHRQ $0x02, R11
	CMPQ R8, DX
	JB decodeBlockAsm_fast_loop_nofetch

	// Fast loop exits fall through into the remainder loop, which reloads
	// the tag itself and checks every access against the true ends in AX/CX.
decodeBlockAsm_fast_end_copy:
decodeBlockAsm_fast_end_done:
decodeBlockAsm_remain_loop:
	CMPQ R8, CX
	JAE decodeBlockAsm_remain_end_copy
	MOVBQZX (R8), DX
	MOVQ DX, BX
	SHRQ $0x02, BX
	CMPQ SI, AX
	JAE decodeBlockAsm_remain_end_copy
	ANDQ $0x03, DX
	JNZ decodeBlockAsm_remain_copy

	// Literal tag; same field decoding as the fast path with DX = tag >> 3.
decodeBlockAsm_remain_lits:
	MOVL BX, DX
	SHRL $0x01, DX
	CMPL DX, $0x1d
	JB decodeBlockAsm_remain_lit_0
	JEQ decodeBlockAsm_remain_lit_1
	CMPL DX, $0x1e
	JEQ decodeBlockAsm_remain_lit_2
	JMP decodeBlockAsm_remain_lit_3

decodeBlockAsm_remain_lit_0:
	INCQ R8
	INCL DX
	LEAQ (SI)(DX*1), R10
	CMPQ R10, AX
	JA corrupt

	// Bit 2 of the tag: copy DX bytes with the offset in R9 instead.
	BTL $0x00, BX
	JC decodeBlockAsm_remain_copy_exec_short
	LEAQ (R8)(DX*1), BX
	CMPQ BX, CX
	JA corrupt

	// genMemMoveShort
	// margin: -1, min move: 1
	// Exact-size moves: each case copies the first and last N bytes of the
	// range, which together cover lengths in the named interval.
	CMPQ DX, $0x03
	JB decodeBlockAsm_remain_lit_0_copy_memmove_move_1or2
	JE decodeBlockAsm_remain_lit_0_copy_memmove_move_3
	CMPQ DX, $0x08
	JBE decodeBlockAsm_remain_lit_0_copy_memmove_move_4through8
	CMPQ DX, $0x10
	JBE decodeBlockAsm_remain_lit_0_copy_memmove_move_8through16
	CMPQ DX, $0x20
	JBE decodeBlockAsm_remain_lit_0_copy_memmove_move_17through32
	JMP decodeBlockAsm_remain_lit_0_copy_memmove_move_33through64

decodeBlockAsm_remain_lit_0_copy_memmove_move_1or2:
	MOVB (R8), BL
	MOVB -1(R8)(DX*1), R10
	MOVB BL, (SI)
	MOVB R10, -1(SI)(DX*1)
	JMP decodeBlockAsm_remain_litcopy_done

decodeBlockAsm_remain_lit_0_copy_memmove_move_3:
	MOVW (R8), BX
	MOVB 2(R8), R10
	MOVW BX, (SI)
	MOVB R10, 2(SI)
	JMP decodeBlockAsm_remain_litcopy_done

decodeBlockAsm_remain_lit_0_copy_memmove_move_4through8:
	MOVL (R8), BX
	MOVL -4(R8)(DX*1), R10
	MOVL BX, (SI)
	MOVL R10, -4(SI)(DX*1)
	JMP decodeBlockAsm_remain_litcopy_done

decodeBlockAsm_remain_lit_0_copy_memmove_move_8through16:
	MOVQ (R8), BX
	MOVQ -8(R8)(DX*1), R10
	MOVQ BX, (SI)
	MOVQ R10, -8(SI)(DX*1)
	JMP decodeBlockAsm_remain_litcopy_done

decodeBlockAsm_remain_lit_0_copy_memmove_move_17through32:
	MOVOU (R8), X0
	MOVOU -16(R8)(DX*1), X1
	MOVOU X0, (SI)
	MOVOU X1, -16(SI)(DX*1)
	JMP decodeBlockAsm_remain_litcopy_done

decodeBlockAsm_remain_lit_0_copy_memmove_move_33through64:
	MOVOU (R8), X0
	MOVOU 16(R8), X1
	MOVOU -32(R8)(DX*1), X2
	MOVOU -16(R8)(DX*1), X3
	MOVOU X0, (SI)
	MOVOU X1, 16(SI)
	MOVOU X2, -32(SI)(DX*1)
	MOVOU X3, -16(SI)(DX*1)
	JMP decodeBlockAsm_remain_litcopy_done

	// Extended literal headers: advance past the header first, verify it lay
	// inside src, then read the extra bytes at negative displacements.
decodeBlockAsm_remain_lit_1:
	ADDQ $0x02, R8
	CMPQ R8, CX
	JA corrupt
	MOVBQZX -1(R8), DX
	JMP decodeBlockAsm_remain_litcopy_long

decodeBlockAsm_remain_lit_2:
	ADDQ $0x03, R8
	CMPQ R8, CX
	JA corrupt
	MOVWQZX -2(R8), DX
	JMP decodeBlockAsm_remain_litcopy_long

decodeBlockAsm_remain_lit_3:
	ADDQ $0x04, R8
	CMPQ R8, CX
	JA corrupt
	MOVL -4(R8), DX
	SHRL $0x08, DX

	// length = extra value + 30.
decodeBlockAsm_remain_litcopy_long:
	LEAQ 30(DX), DX
	LEAQ (SI)(DX*1), R10
	CMPQ R10, AX
	JA corrupt
	BTL $0x00, BX
	JC decodeBlockAsm_remain_copy_exec
	LEAQ (R8)(DX*1), BX
	CMPQ BX, CX
	JA corrupt
	CMPL DX, $0x40
	JBE decodeBlockAsm_remain_litcopy_short_reduced

	// genMemMoveLong
	// Same head/tail-in-registers plus aligned-middle scheme as the fast path.
	MOVOU (R8), X0
	MOVOU 16(R8), X1
	MOVOU -32(R8)(DX*1), X2
	MOVOU -16(R8)(DX*1), X3
	MOVQ DX, R10
	SHRQ $0x05, R10
	MOVQ SI, BX
	ANDL $0x0000001f, BX
	MOVQ $0x00000040, R11
	SUBQ BX, R11
	DECQ R10
	JA decodeBlockAsm_remain_litcopy_longlarge_forward_sse_loop_32
	LEAQ -32(R8)(R11*1), BX
	LEAQ -32(SI)(R11*1), R12

decodeBlockAsm_remain_litcopy_longlarge_big_loop_back:
	MOVOU (BX), X4
	MOVOU 16(BX), X5
	MOVOA X4, (R12)
	MOVOA X5, 16(R12)
	ADDQ $0x20, R12
	ADDQ $0x20, BX
	ADDQ $0x20, R11
	DECQ R10
	JNA decodeBlockAsm_remain_litcopy_longlarge_big_loop_back

decodeBlockAsm_remain_litcopy_longlarge_forward_sse_loop_32:
	MOVOU -32(R8)(R11*1), X4
	MOVOU -16(R8)(R11*1), X5
	MOVOA X4, -32(SI)(R11*1)
	MOVOA X5, -16(SI)(R11*1)
	ADDQ $0x20, R11
	CMPQ DX, R11
	JAE decodeBlockAsm_remain_litcopy_longlarge_forward_sse_loop_32
	MOVOU X0, (SI)
	MOVOU X1, 16(SI)
	MOVOU X2, -32(SI)(DX*1)
	MOVOU X3, -16(SI)(DX*1)
	JMP decodeBlockAsm_remain_litcopy_done

decodeBlockAsm_remain_litcopy_short_reduced:
	// genMemMoveShort
	// margin: -4, min move: 30
	CMPQ DX, $0x20
	JBE decodeBlockAsm_remain_lit_longer_copy_memmove_move_17through32
	JMP decodeBlockAsm_remain_lit_longer_copy_memmove_move_33through64

decodeBlockAsm_remain_lit_longer_copy_memmove_move_17through32:
	MOVOU (R8), X0
	MOVOU -16(R8)(DX*1), X1
	MOVOU X0, (SI)
	MOVOU X1, -16(SI)(DX*1)
	JMP decodeBlockAsm_remain_litcopy_done

decodeBlockAsm_remain_lit_longer_copy_memmove_move_33through64:
	MOVOU (R8), X0
	MOVOU 16(R8), X1
	MOVOU -32(R8)(DX*1), X2
	MOVOU -16(R8)(DX*1), X3
	MOVOU X0, (SI)
	MOVOU X1, 16(SI)
	MOVOU X2, -32(SI)(DX*1)
	MOVOU X3, -16(SI)(DX*1)

	// Literals done: advance and load the next tag unless src or dst is
	// exhausted.
decodeBlockAsm_remain_litcopy_done:
	ADDQ DX, R8
	ADDQ DX, SI
	ADDQ DX, DI
	CMPQ R8, CX
	JAE decodeBlockAsm_remain_end_done
	MOVBQZX (R8), DX
	MOVQ DX, BX
	SHRQ $0x02, BX
	CMPQ SI, AX
	JAE decodeBlockAsm_remain_end_done
	ANDQ $0x03, DX
	JZ decodeBlockAsm_remain_lits

	// Copy tag: DX = tag & 3, BX = tag >> 2.
decodeBlockAsm_remain_copy:
	CMPL DX, $0x02
	JB decodeBlockAsm_remain_copy_1
	JEQ decodeBlockAsm_remain_copy_2
	JMP decodeBlockAsm_remain_copy_3

	// Kind 1: offset = (16-bit word >> 6) + 1; length field = (tag >> 2) & 15.
	// Field 15 means one more byte follows and length = byte + 18; otherwise
	// length = field + 4.
decodeBlockAsm_remain_copy_1:
	ADDQ $0x02, R8
	CMPQ R8, CX
	JA corrupt
	MOVWQZX -2(R8), R9
	MOVQ BX, DX
	ANDL $0x0f, DX
	SHRL $0x06, R9
	INCL R9
	CMPL DX, $0x0f
	JNE decodeBlockAsm_remain_copy_1_short
	ADDQ $0x01, R8
	CMPQ R8, CX
	JA corrupt
	MOVBLZX -1(R8), DX
	LEAL 18(DX), DX
	JMP decodeBlockAsm_remain_copy_exec

decodeBlockAsm_remain_copy_1_short:
	LEAL 4(DX), DX
	JMP decodeBlockAsm_remain_copy_exec_short

	// Kind 2: 16-bit offset (+64); length field in BX selects 0..3 extra
	// length bytes exactly as in the fast path.
decodeBlockAsm_remain_copy_2:
	MOVQ BX, DX
	CMPL BX, $0x3d
	JB decodeBlockAsm_remain_copy_2_0_extra
	JEQ decodeBlockAsm_remain_copy_2_1_extra
	CMPL DX, $0x3f
	JB decodeBlockAsm_remain_copy_2_2_extra
	ADDQ $0x06, R8
	CMPQ R8, CX
	JA corrupt
	MOVWQZX -5(R8), R9
	MOVL -4(R8), DX
	SHRL $0x08, DX
	LEAL 64(DX), DX
	ADDQ $0x40, R9
	JMP decodeBlockAsm_remain_copy_exec_long_long

decodeBlockAsm_remain_copy_2_2_extra:
	ADDQ $0x05, R8
	CMPQ R8, CX
	JA corrupt
	MOVWQZX -4(R8), R9
	MOVWLZX -2(R8), DX
	LEAL 64(DX), DX
	ADDQ $0x40, R9
	JMP decodeBlockAsm_remain_copy_exec_long_long

decodeBlockAsm_remain_copy_2_1_extra:
	ADDQ $0x04, R8
	CMPQ R8, CX
	JA corrupt
	MOVWQZX -3(R8), R9
	MOVBLZX -1(R8), DX
	LEAL 64(DX), DX
	ADDQ $0x40, R9
	JMP decodeBlockAsm_remain_copy_exec_long_long

decodeBlockAsm_remain_copy_2_0_extra:
	LEAQ 3(R8), R8
	CMPQ R8, CX
	JA corrupt
	MOVWQZX -2(R8), R9
	LEAL 4(DX), DX
	ADDQ $0x40, R9
	JMP decodeBlockAsm_remain_copy_short_no_ol

	// Kind 3: fused literals + copy. Four bytes must be available in either
	// form. R10 = (tag >> 3) & 3 is the literal count field.
decodeBlockAsm_remain_copy_3:
	ADDQ $0x04, R8
	CMPQ R8, CX
	JA corrupt
	MOVL -4(R8), R9
	MOVQ BX, R10
	SHRQ $0x01, R10
	ANDQ $0x03, R10
	BTL $0x00, BX
	JC decodeBlockAsm_remain_copy3_read

	// 3-byte form: length = ((tag >> 5) & 7) + 4, 16-bit offset from bytes
	// 1..2 (64 is added after the literals), then R10+1 literal bytes.
	SHRL $0x03, BX
	ANDL $0x07, BX
	LEAL 4(BX), DX
	MOVWQZX -3(R8), R9
	DECQ R8
	INCQ R10
	LEAQ (R8)(R10*1), BX
	LEAQ (SI)(R10*1), R11
	CMPQ BX, CX
	JA corrupt
	CMPQ R11, AX
	JA corrupt

	// genMemMoveVeryShort
	// Exact move of R10 = 1..4 bytes.
	CMPQ R10, $0x03
	JE decodeBlockAsm_remain_copy2_fused_lits_move_3
	JA decodeBlockAsm_remain_copy2_fused_lits_move_4
	MOVB (R8), BL
	MOVB -1(R8)(R10*1), R11
	MOVB BL, (SI)
	MOVB R11, -1(SI)(R10*1)
	JMP decodeBlockAsm_remain_copy2_fused_lits_done

decodeBlockAsm_remain_copy2_fused_lits_move_3:
	MOVW (R8), BX
	MOVB 2(R8), R11
	MOVW BX, (SI)
	MOVB R11, 2(SI)
	JMP decodeBlockAsm_remain_copy2_fused_lits_done

decodeBlockAsm_remain_copy2_fused_lits_move_4:
	MOVL (R8), BX
	MOVL BX, (SI)

decodeBlockAsm_remain_copy2_fused_lits_done:
	ADDQ $0x40, R9
	ADDQ R10, R8
	ADDQ R10, SI
	ADDQ R10, DI
	JMP decodeBlockAsm_remain_copy_short_no_ol

	// 4-byte form: length field = (tag32 >> 5) & 63,
	// offset = (tag32 >> 11) + 65536.
decodeBlockAsm_remain_copy3_read:
	MOVL R9, DX
	SHRL $0x05, DX
	ANDL $0x3f, DX
	SHRL $0x0b, R9
	ADDL $0x00010000, R9
	CMPL DX, $0x3d
	JB decodeBlockAsm_remain_copy_3_0_extra
	JEQ decodeBlockAsm_remain_copy_3_1_extra
	CMPL DX, $0x3e
	JEQ decodeBlockAsm_remain_copy_3_2_extra
	ADDQ $0x03, R8
	CMPQ R8, CX
	JA corrupt
	MOVL -4(R8), DX
	SHRL $0x08, DX
	LEAL 64(DX), DX
	JMP decodeBlockAsm_remain_copy_fused_long

decodeBlockAsm_remain_copy_3_2_extra:
	ADDQ $0x02, R8
	CMPQ R8, CX
	JA corrupt
	MOVWLZX -2(R8), DX
	LEAL 64(DX), DX
	JMP decodeBlockAsm_remain_copy_fused_long

decodeBlockAsm_remain_copy_3_1_extra:
	ADDQ $0x01, R8
	CMPQ R8, CX
	JA corrupt
	MOVBLZX -1(R8), DX
	LEAL 64(DX), DX
	JMP decodeBlockAsm_remain_copy_fused_long

	// No extra length bytes: length = field + 4. Emit the R10 (0..3) fused
	// literals, skipping the move entirely when there are none.
decodeBlockAsm_remain_copy_3_0_extra:
	LEAL 4(DX), DX
	TESTL R10, R10
	JZ decodeBlockAsm_remain_copy_short_no_ol
	LEAQ (R8)(R10*1), BX
	LEAQ (SI)(R10*1), R11
	CMPQ BX, CX
	JA corrupt
	CMPQ R11, AX
	JA corrupt

	// genMemMoveVeryShort
	CMPQ R10, $0x03
	JE decodeBlockAsm_remain_copy3s_fused_lits_move_3
	JA decodeBlockAsm_remain_copy3s_fused_lits_move_4
	MOVB (R8), BL
	MOVB -1(R8)(R10*1), R11
	MOVB BL, (SI)
	MOVB R11, -1(SI)(R10*1)
	JMP decodeBlockAsm_remain_copy3s_fused_lits_done

decodeBlockAsm_remain_copy3s_fused_lits_move_3:
	MOVW (R8), BX
	MOVB 2(R8), R11
	MOVW BX, (SI)
	MOVB R11, 2(SI)
	JMP decodeBlockAsm_remain_copy3s_fused_lits_done

decodeBlockAsm_remain_copy3s_fused_lits_move_4:
	MOVL (R8), BX
	MOVL BX, (SI)

decodeBlockAsm_remain_copy3s_fused_lits_done:
	ADDQ R10, R8
	ADDQ R10, SI
	ADDQ R10, DI
	JMP decodeBlockAsm_remain_copy_short_no_ol

	// Fused literals for the long (length >= 64) 4-byte-header variants.
decodeBlockAsm_remain_copy_fused_long:
	TESTL R10, R10
	JZ decodeBlockAsm_remain_copy_exec_long_long
	LEAQ (R8)(R10*1), BX
	LEAQ (SI)(R10*1), R11
	CMPQ BX, CX
	JA corrupt
	CMPQ R11, AX
	JA corrupt

	// genMemMoveVeryShort
	CMPQ R10, $0x03
	JE decodeBlockAsm_remain_copy3_fused_lits_move_3
	JA decodeBlockAsm_remain_copy3_fused_lits_move_4
	MOVB (R8), BL
	MOVB -1(R8)(R10*1), R11
	MOVB BL, (SI)
	MOVB R11, -1(SI)(R10*1)
	JMP decodeBlockAsm_remain_copy3_fused_lits_done

decodeBlockAsm_remain_copy3_fused_lits_move_3:
	MOVW (R8), BX
	MOVB 2(R8), R11
	MOVW BX, (SI)
	MOVB R11, 2(SI)
	JMP decodeBlockAsm_remain_copy3_fused_lits_done

decodeBlockAsm_remain_copy3_fused_lits_move_4:
	MOVL (R8), BX
	MOVL BX, (SI)

decodeBlockAsm_remain_copy3_fused_lits_done:
	ADDQ R10, R8
	ADDQ R10, SI
	ADDQ R10, DI
	JMP decodeBlockAsm_remain_copy_exec_long_long

	// Copy of DX bytes at offset R9 (short lengths): validate the offset
	// against the bytes written and the end against dst, then pick the
	// overlapping or plain short move. BX = source pointer.
decodeBlockAsm_remain_copy_exec_short:
	CMPL R9, DI
	JA corrupt
	LEAQ (SI)(DX*1), BX
	CMPQ BX, AX
	JA corrupt
	MOVQ SI, BX
	SUBQ R9, BX
	CMPL R9, DX
	JB decodeBlockAsm_remain_copy_overlap
	JMP decodeBlockAsm_remain_copy_short

	// Copy with length and offset both at least 64.
decodeBlockAsm_remain_copy_exec_long_long:
	MOVQ SI, BX
	SUBQ R9, BX
	CMPL R9, DI
	JA corrupt
	LEAQ (SI)(DX*1), R10
	CMPQ R10, AX
	JA corrupt

	// genMemMoveLong
	// Forward 32-byte chunks, then one 32-byte move ending at the end of
	// the range for any remainder (R12).
	MOVQ DX, R10
	SHRQ $0x05, R10
	MOVQ SI, R11
	MOVQ DX, R12

decodeBlockAsm_remain_copy_long_longlarge_big_loop_back:
	MOVOU (BX), X0
	MOVOU 16(BX), X1
	MOVOU X0, (R11)
	MOVOU X1, 16(R11)
	ADDQ $0x20, R11
	ADDQ $0x20, BX
	SUBQ $0x20, R12
	DECQ R10
	JNZ decodeBlockAsm_remain_copy_long_longlarge_big_loop_back
	TESTQ R12, R12
	JZ decodeBlockAsm_remain_copy_done
	MOVOU -32(BX)(R12*1), X0
	MOVOU -16(BX)(R12*1), X1
	MOVOU X0, -32(R11)(R12*1)
	MOVOU X1, -16(R11)(R12*1)
	JMP decodeBlockAsm_remain_copy_done

	// Copy of 4..64 bytes with an offset of at least 64 (no overlap).
decodeBlockAsm_remain_copy_short_no_ol:
	MOVQ SI, BX
	SUBQ R9, BX
	CMPL R9, DI
	JA corrupt
	LEAQ (SI)(DX*1), R10
	CMPQ R10, AX
	JA corrupt

	// genMemMoveShort
	// margin: -4, min move: 4
	CMPQ DX, $0x08
	JBE decodeBlockAsm_remain_copy_short_no_ol_memmove_move_4through8
	CMPQ DX, $0x10
	JBE decodeBlockAsm_remain_copy_short_no_ol_memmove_move_8through16
	CMPQ DX, $0x20
	JBE decodeBlockAsm_remain_copy_short_no_ol_memmove_move_17through32
	JMP decodeBlockAsm_remain_copy_short_no_ol_memmove_move_33through64

decodeBlockAsm_remain_copy_short_no_ol_memmove_move_4through8:
	MOVL (BX), R10
	MOVL -4(BX)(DX*1), BX
	MOVL R10, (SI)
	MOVL BX, -4(SI)(DX*1)
	JMP decodeBlockAsm_remain_copy_done

decodeBlockAsm_remain_copy_short_no_ol_memmove_move_8through16:
	MOVQ (BX), R10
	MOVQ -8(BX)(DX*1), BX
	MOVQ R10, (SI)
	MOVQ BX, -8(SI)(DX*1)
	JMP decodeBlockAsm_remain_copy_done

decodeBlockAsm_remain_copy_short_no_ol_memmove_move_17through32:
	MOVOU (BX), X0
	MOVOU -16(BX)(DX*1), X1
	MOVOU X0, (SI)
	MOVOU X1, -16(SI)(DX*1)
	JMP decodeBlockAsm_remain_copy_done

decodeBlockAsm_remain_copy_short_no_ol_memmove_move_33through64:
	MOVOU (BX), X0
	MOVOU 16(BX), X1
	MOVOU -32(BX)(DX*1), X2
	MOVOU -16(BX)(DX*1), X3
	MOVOU X0, (SI)
	MOVOU X1, 16(SI)
	MOVOU X2, -32(SI)(DX*1)
	MOVOU X3, -16(SI)(DX*1)
	JMP decodeBlockAsm_remain_copy_done

	// General copy of DX bytes at offset R9.
decodeBlockAsm_remain_copy_exec:
	CMPL R9, DI
	JA corrupt
	LEAQ (SI)(DX*1), BX
	CMPQ BX, AX
	JA corrupt
	MOVQ SI, BX
	SUBQ R9, BX
	CMPL R9, DX
	JB decodeBlockAsm_remain_copy_overlap
	CMPL DX, $0x40
	JA decodeBlockAsm_remain_copy_long

decodeBlockAsm_remain_copy_short:
	// genMemMoveShort
	// margin: -4, min move: 1
	CMPQ DX, $0x03
	JB decodeBlockAsm_remain_copy_short_memmove_move_1or2
	JE decodeBlockAsm_remain_copy_short_memmove_move_3
	CMPQ DX, $0x08
	JBE decodeBlockAsm_remain_copy_short_memmove_move_4through8
	CMPQ DX, $0x10
	JBE decodeBlockAsm_remain_copy_short_memmove_move_8through16
	CMPQ DX, $0x20
	JBE decodeBlockAsm_remain_copy_short_memmove_move_17through32
	JMP decodeBlockAsm_remain_copy_short_memmove_move_33through64

decodeBlockAsm_remain_copy_short_memmove_move_1or2:
	MOVB (BX), R10
	MOVB -1(BX)(DX*1), BL
	MOVB R10, (SI)
	MOVB BL, -1(SI)(DX*1)
	JMP decodeBlockAsm_remain_copy_done

decodeBlockAsm_remain_copy_short_memmove_move_3:
	MOVW (BX), R10
	MOVB 2(BX), BL
	MOVW R10, (SI)
	MOVB BL, 2(SI)
	JMP decodeBlockAsm_remain_copy_done

decodeBlockAsm_remain_copy_short_memmove_move_4through8:
	MOVL (BX), R10
	MOVL -4(BX)(DX*1), BX
	MOVL R10, (SI)
	MOVL BX, -4(SI)(DX*1)
	JMP decodeBlockAsm_remain_copy_done

decodeBlockAsm_remain_copy_short_memmove_move_8through16:
	MOVQ (BX), R10
	MOVQ -8(BX)(DX*1), BX
	MOVQ R10, (SI)
	MOVQ BX, -8(SI)(DX*1)
	JMP decodeBlockAsm_remain_copy_done

decodeBlockAsm_remain_copy_short_memmove_move_17through32:
	MOVOU (BX), X0
	MOVOU -16(BX)(DX*1), X1
	MOVOU X0, (SI)
	MOVOU X1, -16(SI)(DX*1)
	JMP decodeBlockAsm_remain_copy_done

decodeBlockAsm_remain_copy_short_memmove_move_33through64:
	MOVOU (BX), X0
	MOVOU 16(BX), X1
	MOVOU -32(BX)(DX*1), X2
	MOVOU -16(BX)(DX*1), X3
	MOVOU X0, (SI)
	MOVOU X1, 16(SI)
	MOVOU X2, -32(SI)(DX*1)
	MOVOU X3, -16(SI)(DX*1)
	JMP decodeBlockAsm_remain_copy_done

	// Non-overlapping copy longer than 64 bytes.
decodeBlockAsm_remain_copy_long:
	// genMemMoveLong
	MOVOU (BX), X0
	MOVOU 16(BX), X1
	MOVOU -32(BX)(DX*1), X2
	MOVOU -16(BX)(DX*1), X3
	MOVQ DX, R11
	SHRQ $0x05, R11
	MOVQ SI, R10
	ANDL $0x0000001f, R10
	MOVQ $0x00000040, R12
	SUBQ R10, R12
	DECQ R11
	JA decodeBlockAsm_remain_copy_longlarge_forward_sse_loop_32
	LEAQ -32(BX)(R12*1), R10
	LEAQ -32(SI)(R12*1), R13

decodeBlockAsm_remain_copy_longlarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R13)
	MOVOA X5, 16(R13)
	ADDQ $0x20, R13
	ADDQ $0x20, R10
	ADDQ $0x20, R12
	DECQ R11
	JNA decodeBlockAsm_remain_copy_longlarge_big_loop_back

decodeBlockAsm_remain_copy_longlarge_forward_sse_loop_32:
	MOVOU -32(BX)(R12*1), X4
	MOVOU -16(BX)(R12*1), X5
	MOVOA X4, -32(SI)(R12*1)
	MOVOA X5, -16(SI)(R12*1)
	ADDQ $0x20, R12
	CMPQ DX, R12
	JAE decodeBlockAsm_remain_copy_longlarge_forward_sse_loop_32
	MOVOU X0, (SI)
	MOVOU X1, 16(SI)
	MOVOU X2, -32(SI)(DX*1)
	MOVOU X3, -16(SI)(DX*1)

decodeBlockAsm_remain_copy_done:
	ADDQ DX, SI
	ADDQ DX, DI
	JMP decodeBlockAsm_remain_loop

	// Overlapping copy in the remainder loop: plain byte-by-byte forward
	// copy, which reproduces the repeating pattern for any offset.
decodeBlockAsm_remain_copy_overlap:
	ADDQ DX, DI

decodeBlockAsm_remain_copy_overlap_simple:
	MOVB (BX), R10
	MOVB R10, (SI)
	INCQ BX
	INCQ SI
	DECQ DX
	JNZ decodeBlockAsm_remain_copy_overlap_simple
	JMP decodeBlockAsm_remain_loop

	// Finish: recompute both slice ends from the arguments and require that
	// dst was filled exactly and src was consumed exactly.
decodeBlockAsm_remain_end_copy:
decodeBlockAsm_remain_end_done:
	MOVQ src_base+24(FP), AX
	MOVQ src_len+32(FP), CX
	MOVQ dst_base+0(FP), DX
	MOVQ dst_len+8(FP), BX
	LEAQ (DX)(BX*1), DX
	LEAQ (AX)(CX*1), AX
	CMPQ SI, DX
	JNE corrupt
	CMPQ R8, AX
	JNE corrupt
	MOVQ $0x00000000, ret+48(FP)
	RET

	// Shared failure exit: return 1.
corrupt:
	MOVQ $0x00000001, ret+48(FP)
	RET