Full Mattermost server source with integrated Community Enterprise features.
Includes vendor directory for offline/air-gapped builds.

Structure:
- enterprise-impl/: Enterprise feature implementations
- enterprise-community/: Init files that register implementations
- enterprise/: Bridge imports (community_imports.go)
- vendor/: All dependencies for offline builds

Build (online):
    go build ./cmd/mattermost

Build (offline/air-gapped):
    go build -mod=vendor ./cmd/mattermost

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
20792 lines
512 KiB
ArmAsm
20792 lines
512 KiB
ArmAsm
// Code generated by command: go run gen.go -out ../asm_amd64.s -stubs ../asm_amd64.go -pkg=minlz. DO NOT EDIT.
|
|
|
|
//go:build !appengine && !noasm && gc && !purego
|
|
|
|
#include "textflag.h"
|
|
|
|
// func _dummy_()
//
// Placeholder routine: it takes no arguments, declares a zero-byte frame ($0)
// and its only instruction is RET.
//
// The preprocessor block placed inside it defines GOAMD64_v3 whenever
// GOAMD64_v4 is defined and GOAMD64_v3 is not yet defined. The
// `#ifdef GOAMD64_v3` checks further down in this file use that macro to
// select TZCNTQ instead of BSFQ.
// NOTE(review): GOAMD64_v3 / GOAMD64_v4 are presumably supplied by the Go
// toolchain according to the GOAMD64 build setting - confirm.
|
|
TEXT ·_dummy_(SB), $0
|
|
// A v4 build should also take the v3 code paths, so make v4 imply v3.
#ifdef GOAMD64_v4
|
|
// Only define the macro if it is not defined already.
#ifndef GOAMD64_v3
|
|
#define GOAMD64_v3
|
|
#endif
|
|
#endif
|
|
// Nothing to execute; return immediately.
RET
|
|
|
|
// func encodeBlockAsm(dst []byte, src []byte, tmp *[131072]byte) int
|
|
// Requires: BMI, CMOV, SSE2
|
|
TEXT ·encodeBlockAsm(SB), $24-64
|
|
MOVQ tmp+48(FP), AX
|
|
MOVQ dst_base+0(FP), CX
|
|
MOVQ $0x00000400, DX
|
|
MOVQ AX, BX
|
|
PXOR X0, X0
|
|
|
|
zero_loop_encodeBlockAsm:
|
|
MOVOU X0, (BX)
|
|
MOVOU X0, 16(BX)
|
|
MOVOU X0, 32(BX)
|
|
MOVOU X0, 48(BX)
|
|
MOVOU X0, 64(BX)
|
|
MOVOU X0, 80(BX)
|
|
MOVOU X0, 96(BX)
|
|
MOVOU X0, 112(BX)
|
|
ADDQ $0x80, BX
|
|
DECQ DX
|
|
JNZ zero_loop_encodeBlockAsm
|
|
MOVL $0x00000000, 12(SP)
|
|
MOVQ src_len+32(FP), DX
|
|
LEAQ -17(DX), BX
|
|
LEAQ -17(DX), SI
|
|
MOVL SI, 8(SP)
|
|
SHRQ $0x05, DX
|
|
SUBL DX, BX
|
|
LEAQ (CX)(BX*1), BX
|
|
MOVQ BX, (SP)
|
|
MOVL $0x00000001, DX
|
|
MOVL DX, 16(SP)
|
|
MOVQ src_base+24(FP), BX
|
|
|
|
search_loop_encodeBlockAsm:
|
|
MOVL DX, SI
|
|
SUBL 12(SP), SI
|
|
SHRL $0x06, SI
|
|
LEAL 4(DX)(SI*1), SI
|
|
CMPL SI, 8(SP)
|
|
JAE emit_remainder_encodeBlockAsm
|
|
MOVQ (BX)(DX*1), DI
|
|
LEAL -2162685(DX), R8
|
|
MOVL SI, 20(SP)
|
|
MOVQ $0x0000cf1bbcdcbf9b, R10
|
|
MOVQ DI, R11
|
|
MOVQ DI, R12
|
|
SHRQ $0x08, R12
|
|
SHLQ $0x10, R11
|
|
IMULQ R10, R11
|
|
SHRQ $0x31, R11
|
|
SHLQ $0x10, R12
|
|
IMULQ R10, R12
|
|
SHRQ $0x31, R12
|
|
MOVL (AX)(R11*4), SI
|
|
MOVL (AX)(R12*4), R9
|
|
MOVL DX, (AX)(R11*4)
|
|
MOVL DX, (AX)(R12*4)
|
|
MOVQ DI, R11
|
|
SHRQ $0x10, R11
|
|
SHLQ $0x10, R11
|
|
IMULQ R10, R11
|
|
SHRQ $0x31, R11
|
|
MOVL DX, R10
|
|
SUBL 16(SP), R10
|
|
MOVL 1(BX)(R10*1), R12
|
|
MOVQ DI, R10
|
|
SHRQ $0x08, R10
|
|
CMPL R10, R12
|
|
JNE no_repeat_found_encodeBlockAsm
|
|
LEAL 1(DX), DI
|
|
MOVL 12(SP), SI
|
|
MOVL DI, R8
|
|
SUBL 16(SP), R8
|
|
JZ repeat_extend_back_end_encodeBlockAsm
|
|
|
|
repeat_extend_back_loop_encodeBlockAsm:
|
|
CMPL DI, SI
|
|
JBE repeat_extend_back_end_encodeBlockAsm
|
|
MOVB -1(BX)(R8*1), R9
|
|
MOVB -1(BX)(DI*1), R10
|
|
CMPB R9, R10
|
|
JNE repeat_extend_back_end_encodeBlockAsm
|
|
LEAL -1(DI), DI
|
|
DECL R8
|
|
JNZ repeat_extend_back_loop_encodeBlockAsm
|
|
|
|
repeat_extend_back_end_encodeBlockAsm:
|
|
MOVL DI, SI
|
|
MOVL 12(SP), R8
|
|
SUBL R8, SI
|
|
LEAQ 4(CX)(SI*1), R9
|
|
CMPQ R9, (SP)
|
|
JB dst_size_check_ok_1
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
dst_size_check_ok_1:
|
|
LEAQ (BX)(R8*1), R8
|
|
|
|
// emitLiteral
|
|
LEAL -1(SI), R9
|
|
CMPL R9, $0x1d
|
|
JB one_byte_repeat_emit_lits_encodeBlockAsm
|
|
SUBL $0x1d, R9
|
|
CMPL R9, $0x00000100
|
|
JB two_bytes_repeat_emit_lits_encodeBlockAsm
|
|
CMPL R9, $0x00010000
|
|
JB three_bytes_repeat_emit_lits_encodeBlockAsm
|
|
MOVL R9, R10
|
|
SHRL $0x10, R10
|
|
MOVB $0xf8, (CX)
|
|
MOVW R9, 1(CX)
|
|
MOVB R10, 3(CX)
|
|
ADDQ $0x04, CX
|
|
ADDL $0x1d, R9
|
|
JMP memmove_long_repeat_emit_lits_encodeBlockAsm
|
|
|
|
three_bytes_repeat_emit_lits_encodeBlockAsm:
|
|
MOVB $0xf0, (CX)
|
|
MOVW R9, 1(CX)
|
|
ADDQ $0x03, CX
|
|
ADDL $0x1d, R9
|
|
JMP memmove_long_repeat_emit_lits_encodeBlockAsm
|
|
|
|
two_bytes_repeat_emit_lits_encodeBlockAsm:
|
|
MOVB $0xe8, (CX)
|
|
MOVB R9, 1(CX)
|
|
ADDL $0x1d, R9
|
|
ADDQ $0x02, CX
|
|
CMPL R9, $0x40
|
|
JB memmove_midrepeat_emit_lits_encodeBlockAsm
|
|
JMP memmove_long_repeat_emit_lits_encodeBlockAsm
|
|
|
|
one_byte_repeat_emit_lits_encodeBlockAsm:
|
|
SHLB $0x03, R9
|
|
MOVB R9, (CX)
|
|
ADDQ $0x01, CX
|
|
LEAQ (CX)(SI*1), R9
|
|
|
|
// genMemMoveShort
|
|
// margin: 16, min move: 1
|
|
CMPQ SI, $0x10
|
|
JBE emit_lit_memmove_repeat_emit_lits_encodeBlockAsm_memmove_move_8through16
|
|
CMPQ SI, $0x20
|
|
JBE emit_lit_memmove_repeat_emit_lits_encodeBlockAsm_memmove_move_17through32
|
|
JMP emit_lit_memmove_repeat_emit_lits_encodeBlockAsm_memmove_move_33through64
|
|
|
|
emit_lit_memmove_repeat_emit_lits_encodeBlockAsm_memmove_move_8through16:
|
|
MOVOU (R8), X0
|
|
MOVOU X0, (CX)
|
|
JMP memmove_end_copy_repeat_emit_lits_encodeBlockAsm
|
|
|
|
emit_lit_memmove_repeat_emit_lits_encodeBlockAsm_memmove_move_17through32:
|
|
MOVOU (R8), X0
|
|
MOVOU -16(R8)(SI*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(SI*1)
|
|
JMP memmove_end_copy_repeat_emit_lits_encodeBlockAsm
|
|
|
|
emit_lit_memmove_repeat_emit_lits_encodeBlockAsm_memmove_move_33through64:
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(SI*1), X2
|
|
MOVOU -16(R8)(SI*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(SI*1)
|
|
MOVOU X3, -16(CX)(SI*1)
|
|
|
|
memmove_end_copy_repeat_emit_lits_encodeBlockAsm:
|
|
MOVQ R9, CX
|
|
JMP repeat_emit_lits_end_encodeBlockAsm
|
|
|
|
memmove_midrepeat_emit_lits_encodeBlockAsm:
|
|
LEAQ (CX)(SI*1), R9
|
|
|
|
// genMemMoveShort
|
|
// margin: 15, min move: 30
|
|
CMPQ SI, $0x20
|
|
JBE emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm_memmove_move_17through32
|
|
JMP emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm_memmove_move_33through64
|
|
|
|
emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm_memmove_move_17through32:
|
|
MOVOU (R8), X0
|
|
MOVOU -16(R8)(SI*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(SI*1)
|
|
JMP memmove_mid_end_copy_repeat_emit_lits_encodeBlockAsm
|
|
|
|
emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm_memmove_move_33through64:
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(SI*1), X2
|
|
MOVOU -16(R8)(SI*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(SI*1)
|
|
MOVOU X3, -16(CX)(SI*1)
|
|
|
|
memmove_mid_end_copy_repeat_emit_lits_encodeBlockAsm:
|
|
MOVQ R9, CX
|
|
JMP repeat_emit_lits_end_encodeBlockAsm
|
|
|
|
memmove_long_repeat_emit_lits_encodeBlockAsm:
|
|
LEAQ (CX)(SI*1), R9
|
|
|
|
// genMemMoveLong
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(SI*1), X2
|
|
MOVOU -16(R8)(SI*1), X3
|
|
MOVQ SI, R11
|
|
SHRQ $0x05, R11
|
|
MOVQ CX, R10
|
|
ANDL $0x0000001f, R10
|
|
MOVQ $0x00000040, R12
|
|
SUBQ R10, R12
|
|
DECQ R11
|
|
JA emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsmlarge_forward_sse_loop_32
|
|
LEAQ -32(R8)(R12*1), R10
|
|
LEAQ -32(CX)(R12*1), R13
|
|
|
|
emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsmlarge_big_loop_back:
|
|
MOVOU (R10), X4
|
|
MOVOU 16(R10), X5
|
|
MOVOA X4, (R13)
|
|
MOVOA X5, 16(R13)
|
|
ADDQ $0x20, R13
|
|
ADDQ $0x20, R10
|
|
ADDQ $0x20, R12
|
|
DECQ R11
|
|
JNA emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsmlarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsmlarge_forward_sse_loop_32:
|
|
MOVOU -32(R8)(R12*1), X4
|
|
MOVOU -16(R8)(R12*1), X5
|
|
MOVOA X4, -32(CX)(R12*1)
|
|
MOVOA X5, -16(CX)(R12*1)
|
|
ADDQ $0x20, R12
|
|
CMPQ SI, R12
|
|
JAE emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsmlarge_forward_sse_loop_32
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(SI*1)
|
|
MOVOU X3, -16(CX)(SI*1)
|
|
MOVQ R9, CX
|
|
|
|
repeat_emit_lits_end_encodeBlockAsm:
|
|
ADDL $0x05, DX
|
|
MOVL DX, SI
|
|
SUBL 16(SP), SI
|
|
MOVQ src_len+32(FP), R8
|
|
SUBL DX, R8
|
|
LEAQ (BX)(DX*1), R9
|
|
LEAQ (BX)(SI*1), SI
|
|
|
|
// matchLen
|
|
XORL R11, R11
|
|
JMP matchlen_loop_16_entry_repeat_extend_encodeBlockAsm
|
|
|
|
matchlen_loopback_16_repeat_extend_encodeBlockAsm:
|
|
MOVQ (R9)(R11*1), R10
|
|
MOVQ 8(R9)(R11*1), R12
|
|
XORQ (SI)(R11*1), R10
|
|
JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm
|
|
XORQ 8(SI)(R11*1), R12
|
|
JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm
|
|
LEAL -16(R8), R8
|
|
LEAL 16(R11), R11
|
|
|
|
matchlen_loop_16_entry_repeat_extend_encodeBlockAsm:
|
|
CMPL R8, $0x10
|
|
JAE matchlen_loopback_16_repeat_extend_encodeBlockAsm
|
|
JMP matchlen_match8_repeat_extend_encodeBlockAsm
|
|
|
|
matchlen_bsf_16repeat_extend_encodeBlockAsm:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R12, R12
|
|
|
|
#else
|
|
BSFQ R12, R12
|
|
|
|
#endif
|
|
SARQ $0x03, R12
|
|
LEAL 8(R11)(R12*1), R11
|
|
JMP repeat_extend_forward_end_encodeBlockAsm
|
|
|
|
matchlen_match8_repeat_extend_encodeBlockAsm:
|
|
CMPL R8, $0x08
|
|
JB matchlen_match4_repeat_extend_encodeBlockAsm
|
|
MOVQ (R9)(R11*1), R10
|
|
XORQ (SI)(R11*1), R10
|
|
JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm
|
|
LEAL -8(R8), R8
|
|
LEAL 8(R11), R11
|
|
JMP matchlen_match4_repeat_extend_encodeBlockAsm
|
|
|
|
matchlen_bsf_8_repeat_extend_encodeBlockAsm:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R10, R10
|
|
|
|
#else
|
|
BSFQ R10, R10
|
|
|
|
#endif
|
|
SARQ $0x03, R10
|
|
LEAL (R11)(R10*1), R11
|
|
JMP repeat_extend_forward_end_encodeBlockAsm
|
|
|
|
matchlen_match4_repeat_extend_encodeBlockAsm:
|
|
CMPL R8, $0x04
|
|
JB matchlen_match2_repeat_extend_encodeBlockAsm
|
|
MOVL (R9)(R11*1), R10
|
|
CMPL (SI)(R11*1), R10
|
|
JNE matchlen_match2_repeat_extend_encodeBlockAsm
|
|
LEAL -4(R8), R8
|
|
LEAL 4(R11), R11
|
|
|
|
matchlen_match2_repeat_extend_encodeBlockAsm:
|
|
CMPL R8, $0x01
|
|
JE matchlen_match1_repeat_extend_encodeBlockAsm
|
|
JB repeat_extend_forward_end_encodeBlockAsm
|
|
MOVW (R9)(R11*1), R10
|
|
CMPW (SI)(R11*1), R10
|
|
JNE matchlen_match1_repeat_extend_encodeBlockAsm
|
|
LEAL 2(R11), R11
|
|
SUBL $0x02, R8
|
|
JZ repeat_extend_forward_end_encodeBlockAsm
|
|
|
|
matchlen_match1_repeat_extend_encodeBlockAsm:
|
|
MOVB (R9)(R11*1), R10
|
|
CMPB (SI)(R11*1), R10
|
|
JNE repeat_extend_forward_end_encodeBlockAsm
|
|
LEAL 1(R11), R11
|
|
|
|
repeat_extend_forward_end_encodeBlockAsm:
|
|
ADDL R11, DX
|
|
MOVL DX, SI
|
|
SUBL DI, SI
|
|
MOVL 16(SP), DI
|
|
|
|
// emitRepeat
|
|
LEAL -1(SI), DI
|
|
CMPL SI, $0x1d
|
|
JBE repeat_one_match_repeat_encodeBlockAsm
|
|
LEAL -30(SI), DI
|
|
CMPL SI, $0x0000011e
|
|
JB repeat_two_match_repeat_encodeBlockAsm
|
|
CMPL SI, $0x0001001e
|
|
JB repeat_three_match_repeat_encodeBlockAsm
|
|
MOVB $0xfc, (CX)
|
|
MOVL DI, 1(CX)
|
|
ADDQ $0x04, CX
|
|
JMP repeat_end_emit_encodeBlockAsm
|
|
|
|
repeat_three_match_repeat_encodeBlockAsm:
|
|
MOVB $0xf4, (CX)
|
|
MOVW DI, 1(CX)
|
|
ADDQ $0x03, CX
|
|
JMP repeat_end_emit_encodeBlockAsm
|
|
|
|
repeat_two_match_repeat_encodeBlockAsm:
|
|
MOVB $0xec, (CX)
|
|
MOVB DI, 1(CX)
|
|
ADDQ $0x02, CX
|
|
JMP repeat_end_emit_encodeBlockAsm
|
|
|
|
repeat_one_match_repeat_encodeBlockAsm:
|
|
XORL DI, DI
|
|
LEAL -4(DI)(SI*8), DI
|
|
MOVB DI, (CX)
|
|
ADDQ $0x01, CX
|
|
|
|
repeat_end_emit_encodeBlockAsm:
|
|
MOVL DX, 12(SP)
|
|
JMP search_loop_encodeBlockAsm
|
|
|
|
no_repeat_found_encodeBlockAsm:
|
|
CMPL SI, R8
|
|
JLE offset_ok_0_encodeBlockAsm
|
|
CMPL (BX)(SI*1), DI
|
|
JEQ candidate_match_encodeBlockAsm
|
|
|
|
offset_ok_0_encodeBlockAsm:
|
|
SHRQ $0x08, DI
|
|
MOVL (AX)(R11*4), SI
|
|
LEAL 2(DX), R10
|
|
CMPL R9, R8
|
|
JLE offset_ok_1_encodeBlockAsm
|
|
CMPL (BX)(R9*1), DI
|
|
JEQ candidate2_match_encodeBlockAsm
|
|
|
|
offset_ok_1_encodeBlockAsm:
|
|
MOVL R10, (AX)(R11*4)
|
|
SHRQ $0x08, DI
|
|
CMPL SI, R8
|
|
JLE offset_ok_2_encodeBlockAsm
|
|
CMPL (BX)(SI*1), DI
|
|
JEQ candidate3_match_encodeBlockAsm
|
|
|
|
offset_ok_2_encodeBlockAsm:
|
|
MOVL 20(SP), DX
|
|
JMP search_loop_encodeBlockAsm
|
|
|
|
candidate3_match_encodeBlockAsm:
|
|
ADDL $0x02, DX
|
|
JMP candidate_match_encodeBlockAsm
|
|
|
|
candidate2_match_encodeBlockAsm:
|
|
MOVL R10, (AX)(R11*4)
|
|
INCL DX
|
|
MOVL R9, SI
|
|
|
|
candidate_match_encodeBlockAsm:
|
|
MOVL 12(SP), DI
|
|
TESTL SI, SI
|
|
JZ match_extend_back_end_encodeBlockAsm
|
|
|
|
match_extend_back_loop_encodeBlockAsm:
|
|
CMPL DX, DI
|
|
JBE match_extend_back_end_encodeBlockAsm
|
|
MOVB -1(BX)(SI*1), R8
|
|
MOVB -1(BX)(DX*1), R9
|
|
CMPB R8, R9
|
|
JNE match_extend_back_end_encodeBlockAsm
|
|
LEAL -1(DX), DX
|
|
DECL SI
|
|
JZ match_extend_back_end_encodeBlockAsm
|
|
JMP match_extend_back_loop_encodeBlockAsm
|
|
|
|
match_extend_back_end_encodeBlockAsm:
|
|
CMPQ CX, (SP)
|
|
JB dst_size_check_ok_2
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
dst_size_check_ok_2:
|
|
MOVL DX, R8
|
|
MOVL DX, DI
|
|
SUBL SI, DI
|
|
MOVL DI, 16(SP)
|
|
ADDL $0x04, DX
|
|
ADDL $0x04, SI
|
|
MOVQ src_len+32(FP), DI
|
|
SUBL DX, DI
|
|
LEAQ (BX)(DX*1), R9
|
|
LEAQ (BX)(SI*1), SI
|
|
|
|
// matchLen
|
|
XORL R11, R11
|
|
JMP matchlen_loop_16_entry_match_nolit_encodeBlockAsm
|
|
|
|
matchlen_loopback_16_match_nolit_encodeBlockAsm:
|
|
MOVQ (R9)(R11*1), R10
|
|
MOVQ 8(R9)(R11*1), R12
|
|
XORQ (SI)(R11*1), R10
|
|
JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm
|
|
XORQ 8(SI)(R11*1), R12
|
|
JNZ matchlen_bsf_16match_nolit_encodeBlockAsm
|
|
LEAL -16(DI), DI
|
|
LEAL 16(R11), R11
|
|
|
|
matchlen_loop_16_entry_match_nolit_encodeBlockAsm:
|
|
CMPL DI, $0x10
|
|
JAE matchlen_loopback_16_match_nolit_encodeBlockAsm
|
|
JMP matchlen_match8_match_nolit_encodeBlockAsm
|
|
|
|
matchlen_bsf_16match_nolit_encodeBlockAsm:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R12, R12
|
|
|
|
#else
|
|
BSFQ R12, R12
|
|
|
|
#endif
|
|
SARQ $0x03, R12
|
|
LEAL 8(R11)(R12*1), R11
|
|
JMP match_nolit_end_encodeBlockAsm
|
|
|
|
matchlen_match8_match_nolit_encodeBlockAsm:
|
|
CMPL DI, $0x08
|
|
JB matchlen_match4_match_nolit_encodeBlockAsm
|
|
MOVQ (R9)(R11*1), R10
|
|
XORQ (SI)(R11*1), R10
|
|
JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm
|
|
LEAL -8(DI), DI
|
|
LEAL 8(R11), R11
|
|
JMP matchlen_match4_match_nolit_encodeBlockAsm
|
|
|
|
matchlen_bsf_8_match_nolit_encodeBlockAsm:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R10, R10
|
|
|
|
#else
|
|
BSFQ R10, R10
|
|
|
|
#endif
|
|
SARQ $0x03, R10
|
|
LEAL (R11)(R10*1), R11
|
|
JMP match_nolit_end_encodeBlockAsm
|
|
|
|
matchlen_match4_match_nolit_encodeBlockAsm:
|
|
CMPL DI, $0x04
|
|
JB matchlen_match2_match_nolit_encodeBlockAsm
|
|
MOVL (R9)(R11*1), R10
|
|
CMPL (SI)(R11*1), R10
|
|
JNE matchlen_match2_match_nolit_encodeBlockAsm
|
|
LEAL -4(DI), DI
|
|
LEAL 4(R11), R11
|
|
|
|
matchlen_match2_match_nolit_encodeBlockAsm:
|
|
CMPL DI, $0x01
|
|
JE matchlen_match1_match_nolit_encodeBlockAsm
|
|
JB match_nolit_end_encodeBlockAsm
|
|
MOVW (R9)(R11*1), R10
|
|
CMPW (SI)(R11*1), R10
|
|
JNE matchlen_match1_match_nolit_encodeBlockAsm
|
|
LEAL 2(R11), R11
|
|
SUBL $0x02, DI
|
|
JZ match_nolit_end_encodeBlockAsm
|
|
|
|
matchlen_match1_match_nolit_encodeBlockAsm:
|
|
MOVB (R9)(R11*1), R10
|
|
CMPB (SI)(R11*1), R10
|
|
JNE match_nolit_end_encodeBlockAsm
|
|
LEAL 1(R11), R11
|
|
|
|
match_nolit_end_encodeBlockAsm:
|
|
ADDL R11, DX
|
|
ADDL $0x04, R11
|
|
MOVL 16(SP), SI
|
|
MOVL 12(SP), DI
|
|
MOVL DX, 12(SP)
|
|
SUBL DI, R8
|
|
JZ match_nolits_copy_encodeBlockAsm
|
|
LEAQ (BX)(DI*1), DI
|
|
CMPL R8, $0x03
|
|
JA match_emit_lits_copy_encodeBlockAsm
|
|
CMPL SI, $0x40
|
|
JB match_emit_lits_copy_encodeBlockAsm
|
|
MOVL (DI), DI
|
|
CMPL SI, $0x0001003f
|
|
JBE match_emit_copy2lits_encodeBlockAsm
|
|
|
|
// emitCopy3
|
|
LEAL -4(R11), R11
|
|
LEAL -65536(SI), SI
|
|
SHLL $0x0b, SI
|
|
LEAL 7(SI)(R8*8), SI
|
|
CMPL R11, $0x3c
|
|
JBE emit_copy3_0_match_emit_lits_encodeBlockAsm
|
|
LEAL -60(R11), R9
|
|
CMPL R11, $0x0000013c
|
|
JB emit_copy3_1_match_emit_lits_encodeBlockAsm
|
|
CMPL R11, $0x0001003c
|
|
JB emit_copy3_2_match_emit_lits_encodeBlockAsm
|
|
ADDL $0x000007e0, SI
|
|
MOVL SI, (CX)
|
|
MOVL R9, 4(CX)
|
|
ADDQ $0x07, CX
|
|
JMP match_emit_copy_litsencodeBlockAsm
|
|
|
|
emit_copy3_2_match_emit_lits_encodeBlockAsm:
|
|
ADDL $0x000007c0, SI
|
|
MOVL SI, (CX)
|
|
MOVW R9, 4(CX)
|
|
ADDQ $0x06, CX
|
|
JMP match_emit_copy_litsencodeBlockAsm
|
|
|
|
emit_copy3_1_match_emit_lits_encodeBlockAsm:
|
|
ADDL $0x000007a0, SI
|
|
MOVL SI, (CX)
|
|
MOVB R9, 4(CX)
|
|
ADDQ $0x05, CX
|
|
JMP match_emit_copy_litsencodeBlockAsm
|
|
|
|
emit_copy3_0_match_emit_lits_encodeBlockAsm:
|
|
SHLL $0x05, R11
|
|
ORL R11, SI
|
|
MOVL SI, (CX)
|
|
ADDQ $0x04, CX
|
|
|
|
match_emit_copy_litsencodeBlockAsm:
|
|
MOVL DI, (CX)
|
|
ADDQ R8, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm
|
|
|
|
match_emit_copy2lits_encodeBlockAsm:
|
|
// emitCopy2WithLits
|
|
XORQ R9, R9
|
|
SUBL $0x40, SI
|
|
LEAL -11(R11), R10
|
|
LEAL -4(R11), R11
|
|
MOVW SI, 1(CX)
|
|
CMPL R11, $0x07
|
|
CMOVLGE R10, R9
|
|
MOVQ $0x00000007, SI
|
|
CMOVLLT R11, SI
|
|
LEAL -1(R8)(SI*4), SI
|
|
MOVL $0x00000003, R10
|
|
LEAL (R10)(SI*8), SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x03, CX
|
|
MOVL DI, (CX)
|
|
ADDQ R8, CX
|
|
TESTL R9, R9
|
|
JZ match_nolit_emitcopy_end_encodeBlockAsm
|
|
|
|
// emitRepeat
|
|
LEAL -1(R9), SI
|
|
CMPL R9, $0x1d
|
|
JBE repeat_one_match_emit_repeat_copy2_encodeBlockAsm
|
|
LEAL -30(R9), SI
|
|
CMPL R9, $0x0000011e
|
|
JB repeat_two_match_emit_repeat_copy2_encodeBlockAsm
|
|
CMPL R9, $0x0001001e
|
|
JB repeat_three_match_emit_repeat_copy2_encodeBlockAsm
|
|
MOVB $0xfc, (CX)
|
|
MOVL SI, 1(CX)
|
|
ADDQ $0x04, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm
|
|
|
|
repeat_three_match_emit_repeat_copy2_encodeBlockAsm:
|
|
MOVB $0xf4, (CX)
|
|
MOVW SI, 1(CX)
|
|
ADDQ $0x03, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm
|
|
|
|
repeat_two_match_emit_repeat_copy2_encodeBlockAsm:
|
|
MOVB $0xec, (CX)
|
|
MOVB SI, 1(CX)
|
|
ADDQ $0x02, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm
|
|
|
|
repeat_one_match_emit_repeat_copy2_encodeBlockAsm:
|
|
XORL SI, SI
|
|
LEAL -4(SI)(R9*8), SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x01, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm
|
|
|
|
match_emit_lits_copy_encodeBlockAsm:
|
|
LEAQ 4(CX)(R8*1), R9
|
|
CMPQ R9, (SP)
|
|
JB dst_size_check_ok_3
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
dst_size_check_ok_3:
|
|
// emitLiteral
|
|
LEAL -1(R8), R9
|
|
CMPL R9, $0x1d
|
|
JB one_byte_match_emit_encodeBlockAsm
|
|
SUBL $0x1d, R9
|
|
CMPL R9, $0x00000100
|
|
JB two_bytes_match_emit_encodeBlockAsm
|
|
CMPL R9, $0x00010000
|
|
JB three_bytes_match_emit_encodeBlockAsm
|
|
MOVL R9, R10
|
|
SHRL $0x10, R10
|
|
MOVB $0xf8, (CX)
|
|
MOVW R9, 1(CX)
|
|
MOVB R10, 3(CX)
|
|
ADDQ $0x04, CX
|
|
ADDL $0x1d, R9
|
|
JMP memmove_long_match_emit_encodeBlockAsm
|
|
|
|
three_bytes_match_emit_encodeBlockAsm:
|
|
MOVB $0xf0, (CX)
|
|
MOVW R9, 1(CX)
|
|
ADDQ $0x03, CX
|
|
ADDL $0x1d, R9
|
|
JMP memmove_long_match_emit_encodeBlockAsm
|
|
|
|
two_bytes_match_emit_encodeBlockAsm:
|
|
MOVB $0xe8, (CX)
|
|
MOVB R9, 1(CX)
|
|
ADDL $0x1d, R9
|
|
ADDQ $0x02, CX
|
|
CMPL R9, $0x40
|
|
JB memmove_midmatch_emit_encodeBlockAsm
|
|
JMP memmove_long_match_emit_encodeBlockAsm
|
|
|
|
one_byte_match_emit_encodeBlockAsm:
|
|
SHLB $0x03, R9
|
|
MOVB R9, (CX)
|
|
ADDQ $0x01, CX
|
|
LEAQ (CX)(R8*1), R9
|
|
|
|
// genMemMoveShort
|
|
// margin: 16, min move: 1
|
|
CMPQ R8, $0x10
|
|
JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16
|
|
CMPQ R8, $0x20
|
|
JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32
|
|
JMP emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64
|
|
|
|
emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16:
|
|
MOVOU (DI), X0
|
|
MOVOU X0, (CX)
|
|
JMP memmove_end_copy_match_emit_encodeBlockAsm
|
|
|
|
emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32:
|
|
MOVOU (DI), X0
|
|
MOVOU -16(DI)(R8*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(R8*1)
|
|
JMP memmove_end_copy_match_emit_encodeBlockAsm
|
|
|
|
emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64:
|
|
MOVOU (DI), X0
|
|
MOVOU 16(DI), X1
|
|
MOVOU -32(DI)(R8*1), X2
|
|
MOVOU -16(DI)(R8*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(R8*1)
|
|
MOVOU X3, -16(CX)(R8*1)
|
|
|
|
memmove_end_copy_match_emit_encodeBlockAsm:
|
|
MOVQ R9, CX
|
|
JMP match_nolits_copy_encodeBlockAsm
|
|
|
|
memmove_midmatch_emit_encodeBlockAsm:
|
|
LEAQ (CX)(R8*1), R9
|
|
|
|
// genMemMoveShort
|
|
// margin: 15, min move: 30
|
|
CMPQ R8, $0x20
|
|
JBE emit_lit_memmove_mid_match_emit_encodeBlockAsm_memmove_move_17through32
|
|
JMP emit_lit_memmove_mid_match_emit_encodeBlockAsm_memmove_move_33through64
|
|
|
|
emit_lit_memmove_mid_match_emit_encodeBlockAsm_memmove_move_17through32:
|
|
MOVOU (DI), X0
|
|
MOVOU -16(DI)(R8*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(R8*1)
|
|
JMP memmove_mid_end_copy_match_emit_encodeBlockAsm
|
|
|
|
emit_lit_memmove_mid_match_emit_encodeBlockAsm_memmove_move_33through64:
|
|
MOVOU (DI), X0
|
|
MOVOU 16(DI), X1
|
|
MOVOU -32(DI)(R8*1), X2
|
|
MOVOU -16(DI)(R8*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(R8*1)
|
|
MOVOU X3, -16(CX)(R8*1)
|
|
|
|
memmove_mid_end_copy_match_emit_encodeBlockAsm:
|
|
MOVQ R9, CX
|
|
JMP match_nolits_copy_encodeBlockAsm
|
|
|
|
memmove_long_match_emit_encodeBlockAsm:
|
|
LEAQ (CX)(R8*1), R9
|
|
|
|
// genMemMoveLong
|
|
MOVOU (DI), X0
|
|
MOVOU 16(DI), X1
|
|
MOVOU -32(DI)(R8*1), X2
|
|
MOVOU -16(DI)(R8*1), X3
|
|
MOVQ R8, R12
|
|
SHRQ $0x05, R12
|
|
MOVQ CX, R10
|
|
ANDL $0x0000001f, R10
|
|
MOVQ $0x00000040, R13
|
|
SUBQ R10, R13
|
|
DECQ R12
|
|
JA emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32
|
|
LEAQ -32(DI)(R13*1), R10
|
|
LEAQ -32(CX)(R13*1), R14
|
|
|
|
emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back:
|
|
MOVOU (R10), X4
|
|
MOVOU 16(R10), X5
|
|
MOVOA X4, (R14)
|
|
MOVOA X5, 16(R14)
|
|
ADDQ $0x20, R14
|
|
ADDQ $0x20, R10
|
|
ADDQ $0x20, R13
|
|
DECQ R12
|
|
JNA emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32:
|
|
MOVOU -32(DI)(R13*1), X4
|
|
MOVOU -16(DI)(R13*1), X5
|
|
MOVOA X4, -32(CX)(R13*1)
|
|
MOVOA X5, -16(CX)(R13*1)
|
|
ADDQ $0x20, R13
|
|
CMPQ R8, R13
|
|
JAE emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(R8*1)
|
|
MOVOU X3, -16(CX)(R8*1)
|
|
MOVQ R9, CX
|
|
|
|
match_nolits_copy_encodeBlockAsm:
|
|
// emitCopy
|
|
CMPL SI, $0x0001003f
|
|
JBE two_byte_offset_match_nolit_encodeBlockAsm
|
|
|
|
// emitCopy3
|
|
LEAL -4(R11), R11
|
|
LEAL -65536(SI), SI
|
|
SHLL $0x0b, SI
|
|
ADDL $0x07, SI
|
|
CMPL R11, $0x3c
|
|
JBE emit_copy3_0_match_nolit_encodeBlockAsm_emit3
|
|
LEAL -60(R11), DI
|
|
CMPL R11, $0x0000013c
|
|
JB emit_copy3_1_match_nolit_encodeBlockAsm_emit3
|
|
CMPL R11, $0x0001003c
|
|
JB emit_copy3_2_match_nolit_encodeBlockAsm_emit3
|
|
ADDL $0x000007e0, SI
|
|
MOVL SI, (CX)
|
|
MOVL DI, 4(CX)
|
|
ADDQ $0x07, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm
|
|
|
|
emit_copy3_2_match_nolit_encodeBlockAsm_emit3:
|
|
ADDL $0x000007c0, SI
|
|
MOVL SI, (CX)
|
|
MOVW DI, 4(CX)
|
|
ADDQ $0x06, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm
|
|
|
|
emit_copy3_1_match_nolit_encodeBlockAsm_emit3:
|
|
ADDL $0x000007a0, SI
|
|
MOVL SI, (CX)
|
|
MOVB DI, 4(CX)
|
|
ADDQ $0x05, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm
|
|
|
|
emit_copy3_0_match_nolit_encodeBlockAsm_emit3:
|
|
SHLL $0x05, R11
|
|
ORL R11, SI
|
|
MOVL SI, (CX)
|
|
ADDQ $0x04, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm
|
|
|
|
two_byte_offset_match_nolit_encodeBlockAsm:
|
|
CMPL SI, $0x00000400
|
|
JA two_byte_match_nolit_encodeBlockAsm
|
|
CMPL R11, $0x00000013
|
|
JAE emit_one_longer_match_nolit_encodeBlockAsm
|
|
LEAL -1(SI), SI
|
|
SHLL $0x06, SI
|
|
LEAL -15(SI)(R11*4), SI
|
|
MOVW SI, (CX)
|
|
ADDQ $0x02, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm
|
|
|
|
emit_one_longer_match_nolit_encodeBlockAsm:
|
|
CMPL R11, $0x00000112
|
|
JAE emit_copy1_repeat_match_nolit_encodeBlockAsm
|
|
LEAL -1(SI), SI
|
|
SHLL $0x06, SI
|
|
LEAL 61(SI), SI
|
|
MOVW SI, (CX)
|
|
LEAL -18(R11), SI
|
|
MOVB SI, 2(CX)
|
|
ADDQ $0x03, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm
|
|
|
|
emit_copy1_repeat_match_nolit_encodeBlockAsm:
|
|
LEAL -1(SI), SI
|
|
SHLL $0x06, SI
|
|
LEAL 57(SI), SI
|
|
MOVW SI, (CX)
|
|
ADDQ $0x02, CX
|
|
SUBL $0x12, R11
|
|
|
|
// emitRepeat
|
|
LEAL -1(R11), SI
|
|
CMPL R11, $0x1d
|
|
JBE repeat_one_emit_copy1_do_repeat_match_nolit_encodeBlockAsm
|
|
LEAL -30(R11), SI
|
|
CMPL R11, $0x0000011e
|
|
JB repeat_two_emit_copy1_do_repeat_match_nolit_encodeBlockAsm
|
|
CMPL R11, $0x0001001e
|
|
JB repeat_three_emit_copy1_do_repeat_match_nolit_encodeBlockAsm
|
|
MOVB $0xfc, (CX)
|
|
MOVL SI, 1(CX)
|
|
ADDQ $0x04, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm
|
|
|
|
repeat_three_emit_copy1_do_repeat_match_nolit_encodeBlockAsm:
|
|
MOVB $0xf4, (CX)
|
|
MOVW SI, 1(CX)
|
|
ADDQ $0x03, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm
|
|
|
|
repeat_two_emit_copy1_do_repeat_match_nolit_encodeBlockAsm:
|
|
MOVB $0xec, (CX)
|
|
MOVB SI, 1(CX)
|
|
ADDQ $0x02, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm
|
|
|
|
repeat_one_emit_copy1_do_repeat_match_nolit_encodeBlockAsm:
|
|
XORL SI, SI
|
|
LEAL -4(SI)(R11*8), SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x01, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm
|
|
|
|
two_byte_match_nolit_encodeBlockAsm:
|
|
// emitCopy2
|
|
LEAL -64(SI), SI
|
|
LEAL -4(R11), R11
|
|
MOVW SI, 1(CX)
|
|
CMPL R11, $0x3c
|
|
JBE emit_copy2_0_match_nolit_encodeBlockAsm_emit2
|
|
LEAL -60(R11), SI
|
|
CMPL R11, $0x0000013c
|
|
JB emit_copy2_1_match_nolit_encodeBlockAsm_emit2
|
|
CMPL R11, $0x0001003c
|
|
JB emit_copy2_2_match_nolit_encodeBlockAsm_emit2
|
|
MOVB $0xfe, (CX)
|
|
MOVL SI, 3(CX)
|
|
ADDQ $0x06, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm
|
|
|
|
emit_copy2_2_match_nolit_encodeBlockAsm_emit2:
|
|
MOVB $0xfa, (CX)
|
|
MOVW SI, 3(CX)
|
|
ADDQ $0x05, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm
|
|
|
|
emit_copy2_1_match_nolit_encodeBlockAsm_emit2:
|
|
MOVB $0xf6, (CX)
|
|
MOVB SI, 3(CX)
|
|
ADDQ $0x04, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm
|
|
|
|
emit_copy2_0_match_nolit_encodeBlockAsm_emit2:
|
|
MOVL $0x00000002, SI
|
|
LEAL (SI)(R11*4), SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x03, CX
|
|
|
|
match_nolit_emitcopy_end_encodeBlockAsm:
|
|
CMPL DX, 8(SP)
|
|
JAE emit_remainder_encodeBlockAsm
|
|
MOVQ -2(BX)(DX*1), DI
|
|
CMPQ CX, (SP)
|
|
JB match_nolit_dst_ok_encodeBlockAsm
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
match_nolit_dst_ok_encodeBlockAsm:
|
|
MOVQ $0x0000cf1bbcdcbf9b, SI
|
|
MOVQ DI, R8
|
|
SHRQ $0x10, DI
|
|
MOVQ DI, R9
|
|
SHLQ $0x10, R8
|
|
IMULQ SI, R8
|
|
SHRQ $0x31, R8
|
|
SHLQ $0x10, R9
|
|
IMULQ SI, R9
|
|
SHRQ $0x31, R9
|
|
LEAL -2(DX), R10
|
|
MOVL (AX)(R9*4), SI
|
|
MOVL R10, (AX)(R8*4)
|
|
MOVL DX, (AX)(R9*4)
|
|
MOVL DX, R8
|
|
INCL DX
|
|
LEAL -2162687(R8), R9
|
|
CMPL SI, R9
|
|
JA match_nolit_len_okencodeBlockAsm
|
|
JMP search_loop_encodeBlockAsm
|
|
|
|
match_nolit_len_okencodeBlockAsm:
|
|
CMPL (BX)(SI*1), DI
|
|
JNE search_loop_encodeBlockAsm
|
|
MOVL R8, DI
|
|
SUBL SI, DI
|
|
MOVL DI, 16(SP)
|
|
CMPQ CX, (SP)
|
|
JB dst_size_check_ok_4
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
dst_size_check_ok_4:
|
|
ADDL $0x03, DX
|
|
ADDL $0x04, SI
|
|
MOVQ src_len+32(FP), DI
|
|
SUBL DX, DI
|
|
LEAQ (BX)(DX*1), R8
|
|
LEAQ (BX)(SI*1), SI
|
|
|
|
// matchLen
|
|
XORL R11, R11
|
|
JMP matchlen_loop_16_entry_match_nolit2_encodeBlockAsm
|
|
|
|
matchlen_loopback_16_match_nolit2_encodeBlockAsm:
|
|
MOVQ (R8)(R11*1), R9
|
|
MOVQ 8(R8)(R11*1), R10
|
|
XORQ (SI)(R11*1), R9
|
|
JNZ matchlen_bsf_8_match_nolit2_encodeBlockAsm
|
|
XORQ 8(SI)(R11*1), R10
|
|
JNZ matchlen_bsf_16match_nolit2_encodeBlockAsm
|
|
LEAL -16(DI), DI
|
|
LEAL 16(R11), R11
|
|
|
|
matchlen_loop_16_entry_match_nolit2_encodeBlockAsm:
|
|
CMPL DI, $0x10
|
|
JAE matchlen_loopback_16_match_nolit2_encodeBlockAsm
|
|
JMP matchlen_match8_match_nolit2_encodeBlockAsm
|
|
|
|
matchlen_bsf_16match_nolit2_encodeBlockAsm:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R10, R10
|
|
|
|
#else
|
|
BSFQ R10, R10
|
|
|
|
#endif
|
|
SARQ $0x03, R10
|
|
LEAL 8(R11)(R10*1), R11
|
|
JMP match_nolit2_end_encodeBlockAsm
|
|
|
|
matchlen_match8_match_nolit2_encodeBlockAsm:
|
|
CMPL DI, $0x08
|
|
JB matchlen_match4_match_nolit2_encodeBlockAsm
|
|
MOVQ (R8)(R11*1), R9
|
|
XORQ (SI)(R11*1), R9
|
|
JNZ matchlen_bsf_8_match_nolit2_encodeBlockAsm
|
|
LEAL -8(DI), DI
|
|
LEAL 8(R11), R11
|
|
JMP matchlen_match4_match_nolit2_encodeBlockAsm
|
|
|
|
matchlen_bsf_8_match_nolit2_encodeBlockAsm:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R9, R9
|
|
|
|
#else
|
|
BSFQ R9, R9
|
|
|
|
#endif
|
|
SARQ $0x03, R9
|
|
LEAL (R11)(R9*1), R11
|
|
JMP match_nolit2_end_encodeBlockAsm
|
|
|
|
matchlen_match4_match_nolit2_encodeBlockAsm:
|
|
CMPL DI, $0x04
|
|
JB matchlen_match2_match_nolit2_encodeBlockAsm
|
|
MOVL (R8)(R11*1), R9
|
|
CMPL (SI)(R11*1), R9
|
|
JNE matchlen_match2_match_nolit2_encodeBlockAsm
|
|
LEAL -4(DI), DI
|
|
LEAL 4(R11), R11
|
|
|
|
matchlen_match2_match_nolit2_encodeBlockAsm:
|
|
CMPL DI, $0x01
|
|
JE matchlen_match1_match_nolit2_encodeBlockAsm
|
|
JB match_nolit2_end_encodeBlockAsm
|
|
MOVW (R8)(R11*1), R9
|
|
CMPW (SI)(R11*1), R9
|
|
JNE matchlen_match1_match_nolit2_encodeBlockAsm
|
|
LEAL 2(R11), R11
|
|
SUBL $0x02, DI
|
|
JZ match_nolit2_end_encodeBlockAsm
|
|
|
|
matchlen_match1_match_nolit2_encodeBlockAsm:
|
|
MOVB (R8)(R11*1), R9
|
|
CMPB (SI)(R11*1), R9
|
|
JNE match_nolit2_end_encodeBlockAsm
|
|
LEAL 1(R11), R11
|
|
|
|
match_nolit2_end_encodeBlockAsm:
|
|
ADDL R11, DX
|
|
ADDL $0x04, R11
|
|
MOVL DX, 12(SP)
|
|
MOVL 16(SP), SI
|
|
JMP match_nolits_copy_encodeBlockAsm
|
|
|
|
emit_remainder_encodeBlockAsm:
|
|
MOVQ src_len+32(FP), AX
|
|
MOVL 12(SP), DX
|
|
SUBL DX, AX
|
|
JZ emit_remainder_end_encodeBlockAsm
|
|
LEAQ (BX)(DX*1), DX
|
|
LEAQ 4(CX)(AX*1), BX
|
|
CMPQ BX, (SP)
|
|
JB dst_size_check_ok_5
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
dst_size_check_ok_5:
|
|
// emitLiteral
|
|
LEAL -1(AX), BX
|
|
CMPL BX, $0x1d
|
|
JB one_byte_emit_remainder_encodeBlockAsm
|
|
SUBL $0x1d, BX
|
|
CMPL BX, $0x00000100
|
|
JB two_bytes_emit_remainder_encodeBlockAsm
|
|
CMPL BX, $0x00010000
|
|
JB three_bytes_emit_remainder_encodeBlockAsm
|
|
MOVL BX, SI
|
|
SHRL $0x10, SI
|
|
MOVB $0xf8, (CX)
|
|
MOVW BX, 1(CX)
|
|
MOVB SI, 3(CX)
|
|
ADDQ $0x04, CX
|
|
ADDL $0x1d, BX
|
|
JMP memmove_long_emit_remainder_encodeBlockAsm
|
|
|
|
three_bytes_emit_remainder_encodeBlockAsm:
|
|
MOVB $0xf0, (CX)
|
|
MOVW BX, 1(CX)
|
|
ADDQ $0x03, CX
|
|
ADDL $0x1d, BX
|
|
JMP memmove_long_emit_remainder_encodeBlockAsm
|
|
|
|
two_bytes_emit_remainder_encodeBlockAsm:
|
|
MOVB $0xe8, (CX)
|
|
MOVB BL, 1(CX)
|
|
ADDL $0x1d, BX
|
|
ADDQ $0x02, CX
|
|
CMPL BX, $0x40
|
|
JB memmove_midemit_remainder_encodeBlockAsm
|
|
JMP memmove_long_emit_remainder_encodeBlockAsm
|
|
|
|
one_byte_emit_remainder_encodeBlockAsm:
|
|
SHLB $0x03, BL
|
|
MOVB BL, (CX)
|
|
ADDQ $0x01, CX
|
|
LEAQ (CX)(AX*1), BX
|
|
|
|
// genMemMoveShort
|
|
// margin: 0, min move: 1
|
|
CMPQ AX, $0x03
|
|
JB emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_1or2
|
|
JE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_3
|
|
CMPQ AX, $0x08
|
|
JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_4through8
|
|
CMPQ AX, $0x10
|
|
JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16
|
|
CMPQ AX, $0x20
|
|
JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32
|
|
JMP emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_1or2:
|
|
MOVB (DX), SI
|
|
MOVB -1(DX)(AX*1), DL
|
|
MOVB SI, (CX)
|
|
MOVB DL, -1(CX)(AX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBlockAsm
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_3:
|
|
MOVW (DX), SI
|
|
MOVB 2(DX), DL
|
|
MOVW SI, (CX)
|
|
MOVB DL, 2(CX)
|
|
JMP memmove_end_copy_emit_remainder_encodeBlockAsm
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_4through8:
|
|
MOVL (DX), SI
|
|
MOVL -4(DX)(AX*1), DX
|
|
MOVL SI, (CX)
|
|
MOVL DX, -4(CX)(AX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBlockAsm
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16:
|
|
MOVQ (DX), SI
|
|
MOVQ -8(DX)(AX*1), DX
|
|
MOVQ SI, (CX)
|
|
MOVQ DX, -8(CX)(AX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBlockAsm
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32:
|
|
MOVOU (DX), X0
|
|
MOVOU -16(DX)(AX*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(AX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBlockAsm
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64:
|
|
MOVOU (DX), X0
|
|
MOVOU 16(DX), X1
|
|
MOVOU -32(DX)(AX*1), X2
|
|
MOVOU -16(DX)(AX*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(AX*1)
|
|
MOVOU X3, -16(CX)(AX*1)
|
|
|
|
memmove_end_copy_emit_remainder_encodeBlockAsm:
|
|
MOVQ BX, CX
|
|
JMP emit_remainder_end_encodeBlockAsm
|
|
|
|
memmove_midemit_remainder_encodeBlockAsm:
|
|
LEAQ (CX)(AX*1), BX
|
|
|
|
// genMemMoveShort
|
|
// margin: 0, min move: 30
|
|
CMPQ AX, $0x20
|
|
JBE emit_lit_memmove_mid_emit_remainder_encodeBlockAsm_memmove_move_17through32
|
|
JMP emit_lit_memmove_mid_emit_remainder_encodeBlockAsm_memmove_move_33through64
|
|
|
|
emit_lit_memmove_mid_emit_remainder_encodeBlockAsm_memmove_move_17through32:
|
|
MOVOU (DX), X0
|
|
MOVOU -16(DX)(AX*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(AX*1)
|
|
JMP memmove_mid_end_copy_emit_remainder_encodeBlockAsm
|
|
|
|
emit_lit_memmove_mid_emit_remainder_encodeBlockAsm_memmove_move_33through64:
|
|
MOVOU (DX), X0
|
|
MOVOU 16(DX), X1
|
|
MOVOU -32(DX)(AX*1), X2
|
|
MOVOU -16(DX)(AX*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(AX*1)
|
|
MOVOU X3, -16(CX)(AX*1)
|
|
|
|
memmove_mid_end_copy_emit_remainder_encodeBlockAsm:
|
|
MOVQ BX, CX
|
|
JMP emit_remainder_end_encodeBlockAsm
|
|
|
|
memmove_long_emit_remainder_encodeBlockAsm:
|
|
LEAQ (CX)(AX*1), BX
|
|
|
|
// genMemMoveLong
|
|
MOVOU (DX), X0
|
|
MOVOU 16(DX), X1
|
|
MOVOU -32(DX)(AX*1), X2
|
|
MOVOU -16(DX)(AX*1), X3
|
|
MOVQ AX, DI
|
|
SHRQ $0x05, DI
|
|
MOVQ CX, SI
|
|
ANDL $0x0000001f, SI
|
|
MOVQ $0x00000040, R8
|
|
SUBQ SI, R8
|
|
DECQ DI
|
|
JA emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32
|
|
LEAQ -32(DX)(R8*1), SI
|
|
LEAQ -32(CX)(R8*1), R9
|
|
|
|
emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back:
|
|
MOVOU (SI), X4
|
|
MOVOU 16(SI), X5
|
|
MOVOA X4, (R9)
|
|
MOVOA X5, 16(R9)
|
|
ADDQ $0x20, R9
|
|
ADDQ $0x20, SI
|
|
ADDQ $0x20, R8
|
|
DECQ DI
|
|
JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32:
|
|
MOVOU -32(DX)(R8*1), X4
|
|
MOVOU -16(DX)(R8*1), X5
|
|
MOVOA X4, -32(CX)(R8*1)
|
|
MOVOA X5, -16(CX)(R8*1)
|
|
ADDQ $0x20, R8
|
|
CMPQ AX, R8
|
|
JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(AX*1)
|
|
MOVOU X3, -16(CX)(AX*1)
|
|
MOVQ BX, CX
|
|
|
|
// Common exit: the result is the number of bytes written, i.e. the current
// output pointer CX minus dst_base.
emit_remainder_end_encodeBlockAsm:
|
|
MOVQ dst_base+0(FP), AX
|
|
// CX = CX - dst_base.
SUBQ AX, CX
|
|
MOVQ CX, ret+56(FP)
|
|
RET
|
|
|
|
// func encodeBlockAsm2MB(dst []byte, src []byte, tmp *[131072]byte) int
|
|
// Requires: BMI, CMOV, SSE2
|
|
// Prologue. Register roles established here:
//   AX = tmp (table base), CX = dst write pointer (starts at dst_base),
//   BX = src_base, DX = current source offset (starts at 1).
// Stack slots initialised here (24-byte frame):
//   (SP)   = output limit pointer, compared against CX-derived pointers
//   8(SP)  = src_len - 17, the bound checked by the search loop
//   12(SP) = 0, source offset up to which output has been emitted
//   16(SP) = 1, offset slot subtracted from DX when probing for a repeat
// The result is stored at ret+56(FP); paths that would pass the output limit
// store 0 there.
TEXT ·encodeBlockAsm2MB(SB), $24-64
|
|
MOVQ tmp+48(FP), AX
|
|
MOVQ dst_base+0(FP), CX
|
|
// 0x400 iterations of 0x80 bytes each = 131072 bytes: the whole tmp array.
MOVQ $0x00000400, DX
|
|
MOVQ AX, BX
|
|
PXOR X0, X0
|
|
|
|
// Clear tmp, 128 bytes per iteration, with unaligned 16-byte stores.
zero_loop_encodeBlockAsm2MB:
|
|
MOVOU X0, (BX)
|
|
MOVOU X0, 16(BX)
|
|
MOVOU X0, 32(BX)
|
|
MOVOU X0, 48(BX)
|
|
MOVOU X0, 64(BX)
|
|
MOVOU X0, 80(BX)
|
|
MOVOU X0, 96(BX)
|
|
MOVOU X0, 112(BX)
|
|
ADDQ $0x80, BX
|
|
DECQ DX
|
|
JNZ zero_loop_encodeBlockAsm2MB
|
|
MOVL $0x00000000, 12(SP)
|
|
MOVQ src_len+32(FP), DX
|
|
LEAQ -17(DX), BX
|
|
LEAQ -17(DX), SI
|
|
// 8(SP) = src_len - 17.
MOVL SI, 8(SP)
|
|
// BX = (src_len - 17) - (src_len >> 5), computed as a 32-bit subtraction.
SHRQ $0x05, DX
|
|
SUBL DX, BX
|
|
// (SP) = dst_base + BX: the output limit pointer.
LEAQ (CX)(BX*1), BX
|
|
MOVQ BX, (SP)
|
|
MOVL $0x00000001, DX
|
|
MOVL DX, 16(SP)
|
|
MOVQ src_base+24(FP), BX
|
|
|
|
// One search step at source offset DX: compute the fallback next offset, hash
// the bytes at DX, DX+1 and DX+2 into the table at AX, and first test whether
// the bytes at DX+1 repeat the data found 16(SP) bytes earlier.
search_loop_encodeBlockAsm2MB:
|
|
// SI = DX + 4 + ((DX - 12(SP)) >> 6): the offset to continue from if nothing
// matches here; the stride grows by one per 64 bytes since 12(SP).
MOVL DX, SI
|
|
SUBL 12(SP), SI
|
|
SHRL $0x06, SI
|
|
LEAL 4(DX)(SI*1), SI
|
|
// Stop searching once that offset reaches the limit stored in 8(SP).
CMPL SI, 8(SP)
|
|
JAE emit_remainder_encodeBlockAsm2MB
|
|
// DI = 8 source bytes at BX+DX.
MOVQ (BX)(DX*1), DI
|
|
// Park the fallback offset in 20(SP); SI is reused for a table candidate below.
MOVL SI, 20(SP)
|
|
// Hash: ((x << 16) * 0xcf1bbcdcbf9b) >> 49. The left shift drops the top two
// bytes of x, and the right shift by 49 leaves a 15-bit table index.
MOVQ $0x0000cf1bbcdcbf9b, R9
|
|
MOVQ DI, R10
|
|
MOVQ DI, R11
|
|
SHRQ $0x08, R11
|
|
// R10 = hash of the bytes at DX.
SHLQ $0x10, R10
|
|
IMULQ R9, R10
|
|
SHRQ $0x31, R10
|
|
// R11 = hash of the bytes at DX+1 (DI shifted right by one byte).
SHLQ $0x10, R11
|
|
IMULQ R9, R11
|
|
SHRQ $0x31, R11
|
|
// Load the previous table entries (4-byte source offsets) as candidates:
// SI for the hash at DX, R8 for the hash at DX+1.
MOVL (AX)(R10*4), SI
|
|
MOVL (AX)(R11*4), R8
|
|
// Both slots are then overwritten with the current offset DX.
MOVL DX, (AX)(R10*4)
|
|
MOVL DX, (AX)(R11*4)
|
|
// R10 = hash of the bytes at DX+2 (DI shifted right by two bytes).
MOVQ DI, R10
|
|
SHRQ $0x10, R10
|
|
SHLQ $0x10, R10
|
|
IMULQ R9, R10
|
|
SHRQ $0x31, R10
|
|
// Repeat probe: R11 = 4 bytes at src[DX - 16(SP) + 1].
MOVL DX, R9
|
|
SUBL 16(SP), R9
|
|
MOVL 1(BX)(R9*1), R11
|
|
// R9 (low 32 bits) = the 4 bytes at src[DX + 1].
MOVQ DI, R9
|
|
SHRQ $0x08, R9
|
|
// Unequal 4-byte words: no repeat at DX+1, go test the table candidates.
CMPL R9, R11
|
|
JNE no_repeat_found_encodeBlockAsm2MB
|
|
// Repeat found at DX+1. Extend it backwards byte by byte, then check that the
// pending literals still fit in the output.
//   DI = start of the repeat in src (initially DX+1)
//   SI = 12(SP), the offset up to which output has been emitted
//   R8 = DI - 16(SP), the matching earlier position
LEAL 1(DX), DI
|
|
MOVL 12(SP), SI
|
|
MOVL DI, R8
|
|
SUBL 16(SP), R8
|
|
// The earlier position is offset 0: there is nothing before it to compare.
JZ repeat_extend_back_end_encodeBlockAsm2MB
|
|
|
|
repeat_extend_back_loop_encodeBlockAsm2MB:
|
|
// Never extend back past the already-emitted offset (unsigned DI <= SI stops).
CMPL DI, SI
|
|
JBE repeat_extend_back_end_encodeBlockAsm2MB
|
|
// Compare the byte before the earlier position with the byte before DI.
MOVB -1(BX)(R8*1), R9
|
|
MOVB -1(BX)(DI*1), R10
|
|
CMPB R9, R10
|
|
JNE repeat_extend_back_end_encodeBlockAsm2MB
|
|
// Bytes match: move both positions back by one (LEAL leaves the flags alone).
LEAL -1(DI), DI
|
|
// Continue until the earlier position reaches offset 0.
DECL R8
|
|
JNZ repeat_extend_back_loop_encodeBlockAsm2MB
|
|
|
|
repeat_extend_back_end_encodeBlockAsm2MB:
|
|
// SI = DI - 12(SP): number of literal bytes preceding the repeat.
// R8 = 12(SP): source offset where those literals start.
MOVL DI, SI
|
|
MOVL 12(SP), R8
|
|
SUBL R8, SI
|
|
// R9 = CX + SI + 4; proceed only if it is strictly below the output limit
// kept in (SP), otherwise return 0.
LEAQ 4(CX)(SI*1), R9
|
|
CMPQ R9, (SP)
|
|
JB dst_size_check_ok_1
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
dst_size_check_ok_1:
|
|
LEAQ (BX)(R8*1), R8
|
|
|
|
// emitLiteral
|
|
LEAL -1(SI), R9
|
|
CMPL R9, $0x1d
|
|
JB one_byte_repeat_emit_lits_encodeBlockAsm2MB
|
|
SUBL $0x1d, R9
|
|
CMPL R9, $0x00000100
|
|
JB two_bytes_repeat_emit_lits_encodeBlockAsm2MB
|
|
CMPL R9, $0x00010000
|
|
JB three_bytes_repeat_emit_lits_encodeBlockAsm2MB
|
|
MOVL R9, R10
|
|
SHRL $0x10, R10
|
|
MOVB $0xf8, (CX)
|
|
MOVW R9, 1(CX)
|
|
MOVB R10, 3(CX)
|
|
ADDQ $0x04, CX
|
|
ADDL $0x1d, R9
|
|
JMP memmove_long_repeat_emit_lits_encodeBlockAsm2MB
|
|
|
|
three_bytes_repeat_emit_lits_encodeBlockAsm2MB:
|
|
MOVB $0xf0, (CX)
|
|
MOVW R9, 1(CX)
|
|
ADDQ $0x03, CX
|
|
ADDL $0x1d, R9
|
|
JMP memmove_long_repeat_emit_lits_encodeBlockAsm2MB
|
|
|
|
two_bytes_repeat_emit_lits_encodeBlockAsm2MB:
|
|
MOVB $0xe8, (CX)
|
|
MOVB R9, 1(CX)
|
|
ADDL $0x1d, R9
|
|
ADDQ $0x02, CX
|
|
CMPL R9, $0x40
|
|
JB memmove_midrepeat_emit_lits_encodeBlockAsm2MB
|
|
JMP memmove_long_repeat_emit_lits_encodeBlockAsm2MB
|
|
|
|
one_byte_repeat_emit_lits_encodeBlockAsm2MB:
|
|
SHLB $0x03, R9
|
|
MOVB R9, (CX)
|
|
ADDQ $0x01, CX
|
|
LEAQ (CX)(SI*1), R9
|
|
|
|
// genMemMoveShort
|
|
// margin: 16, min move: 1
|
|
CMPQ SI, $0x10
|
|
JBE emit_lit_memmove_repeat_emit_lits_encodeBlockAsm2MB_memmove_move_8through16
|
|
CMPQ SI, $0x20
|
|
JBE emit_lit_memmove_repeat_emit_lits_encodeBlockAsm2MB_memmove_move_17through32
|
|
JMP emit_lit_memmove_repeat_emit_lits_encodeBlockAsm2MB_memmove_move_33through64
|
|
|
|
emit_lit_memmove_repeat_emit_lits_encodeBlockAsm2MB_memmove_move_8through16:
|
|
MOVOU (R8), X0
|
|
MOVOU X0, (CX)
|
|
JMP memmove_end_copy_repeat_emit_lits_encodeBlockAsm2MB
|
|
|
|
emit_lit_memmove_repeat_emit_lits_encodeBlockAsm2MB_memmove_move_17through32:
|
|
MOVOU (R8), X0
|
|
MOVOU -16(R8)(SI*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(SI*1)
|
|
JMP memmove_end_copy_repeat_emit_lits_encodeBlockAsm2MB
|
|
|
|
emit_lit_memmove_repeat_emit_lits_encodeBlockAsm2MB_memmove_move_33through64:
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(SI*1), X2
|
|
MOVOU -16(R8)(SI*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(SI*1)
|
|
MOVOU X3, -16(CX)(SI*1)
|
|
|
|
memmove_end_copy_repeat_emit_lits_encodeBlockAsm2MB:
|
|
MOVQ R9, CX
|
|
JMP repeat_emit_lits_end_encodeBlockAsm2MB
|
|
|
|
memmove_midrepeat_emit_lits_encodeBlockAsm2MB:
|
|
LEAQ (CX)(SI*1), R9
|
|
|
|
// genMemMoveShort
|
|
// margin: 15, min move: 30
|
|
CMPQ SI, $0x20
|
|
JBE emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm2MB_memmove_move_17through32
|
|
JMP emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm2MB_memmove_move_33through64
|
|
|
|
emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm2MB_memmove_move_17through32:
|
|
MOVOU (R8), X0
|
|
MOVOU -16(R8)(SI*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(SI*1)
|
|
JMP memmove_mid_end_copy_repeat_emit_lits_encodeBlockAsm2MB
|
|
|
|
emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm2MB_memmove_move_33through64:
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(SI*1), X2
|
|
MOVOU -16(R8)(SI*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(SI*1)
|
|
MOVOU X3, -16(CX)(SI*1)
|
|
|
|
memmove_mid_end_copy_repeat_emit_lits_encodeBlockAsm2MB:
|
|
MOVQ R9, CX
|
|
JMP repeat_emit_lits_end_encodeBlockAsm2MB
|
|
|
|
memmove_long_repeat_emit_lits_encodeBlockAsm2MB:
|
|
LEAQ (CX)(SI*1), R9
|
|
|
|
// genMemMoveLong
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(SI*1), X2
|
|
MOVOU -16(R8)(SI*1), X3
|
|
MOVQ SI, R11
|
|
SHRQ $0x05, R11
|
|
MOVQ CX, R10
|
|
ANDL $0x0000001f, R10
|
|
MOVQ $0x00000040, R12
|
|
SUBQ R10, R12
|
|
DECQ R11
|
|
JA emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm2MBlarge_forward_sse_loop_32
|
|
LEAQ -32(R8)(R12*1), R10
|
|
LEAQ -32(CX)(R12*1), R13
|
|
|
|
emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm2MBlarge_big_loop_back:
|
|
MOVOU (R10), X4
|
|
MOVOU 16(R10), X5
|
|
MOVOA X4, (R13)
|
|
MOVOA X5, 16(R13)
|
|
ADDQ $0x20, R13
|
|
ADDQ $0x20, R10
|
|
ADDQ $0x20, R12
|
|
DECQ R11
|
|
JNA emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm2MBlarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm2MBlarge_forward_sse_loop_32:
|
|
MOVOU -32(R8)(R12*1), X4
|
|
MOVOU -16(R8)(R12*1), X5
|
|
MOVOA X4, -32(CX)(R12*1)
|
|
MOVOA X5, -16(CX)(R12*1)
|
|
ADDQ $0x20, R12
|
|
CMPQ SI, R12
|
|
JAE emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm2MBlarge_forward_sse_loop_32
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(SI*1)
|
|
MOVOU X3, -16(CX)(SI*1)
|
|
MOVQ R9, CX
|
|
|
|
repeat_emit_lits_end_encodeBlockAsm2MB:
|
|
ADDL $0x05, DX
|
|
MOVL DX, SI
|
|
SUBL 16(SP), SI
|
|
MOVQ src_len+32(FP), R8
|
|
SUBL DX, R8
|
|
LEAQ (BX)(DX*1), R9
|
|
LEAQ (BX)(SI*1), SI
|
|
|
|
// matchLen
|
|
XORL R11, R11
|
|
JMP matchlen_loop_16_entry_repeat_extend_encodeBlockAsm2MB
|
|
|
|
matchlen_loopback_16_repeat_extend_encodeBlockAsm2MB:
|
|
MOVQ (R9)(R11*1), R10
|
|
MOVQ 8(R9)(R11*1), R12
|
|
XORQ (SI)(R11*1), R10
|
|
JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm2MB
|
|
XORQ 8(SI)(R11*1), R12
|
|
JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm2MB
|
|
LEAL -16(R8), R8
|
|
LEAL 16(R11), R11
|
|
|
|
matchlen_loop_16_entry_repeat_extend_encodeBlockAsm2MB:
|
|
CMPL R8, $0x10
|
|
JAE matchlen_loopback_16_repeat_extend_encodeBlockAsm2MB
|
|
JMP matchlen_match8_repeat_extend_encodeBlockAsm2MB
|
|
|
|
matchlen_bsf_16repeat_extend_encodeBlockAsm2MB:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R12, R12
|
|
|
|
#else
|
|
BSFQ R12, R12
|
|
|
|
#endif
|
|
SARQ $0x03, R12
|
|
LEAL 8(R11)(R12*1), R11
|
|
JMP repeat_extend_forward_end_encodeBlockAsm2MB
|
|
|
|
matchlen_match8_repeat_extend_encodeBlockAsm2MB:
|
|
CMPL R8, $0x08
|
|
JB matchlen_match4_repeat_extend_encodeBlockAsm2MB
|
|
MOVQ (R9)(R11*1), R10
|
|
XORQ (SI)(R11*1), R10
|
|
JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm2MB
|
|
LEAL -8(R8), R8
|
|
LEAL 8(R11), R11
|
|
JMP matchlen_match4_repeat_extend_encodeBlockAsm2MB
|
|
|
|
matchlen_bsf_8_repeat_extend_encodeBlockAsm2MB:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R10, R10
|
|
|
|
#else
|
|
BSFQ R10, R10
|
|
|
|
#endif
|
|
SARQ $0x03, R10
|
|
LEAL (R11)(R10*1), R11
|
|
JMP repeat_extend_forward_end_encodeBlockAsm2MB
|
|
|
|
matchlen_match4_repeat_extend_encodeBlockAsm2MB:
|
|
CMPL R8, $0x04
|
|
JB matchlen_match2_repeat_extend_encodeBlockAsm2MB
|
|
MOVL (R9)(R11*1), R10
|
|
CMPL (SI)(R11*1), R10
|
|
JNE matchlen_match2_repeat_extend_encodeBlockAsm2MB
|
|
LEAL -4(R8), R8
|
|
LEAL 4(R11), R11
|
|
|
|
matchlen_match2_repeat_extend_encodeBlockAsm2MB:
|
|
CMPL R8, $0x01
|
|
JE matchlen_match1_repeat_extend_encodeBlockAsm2MB
|
|
JB repeat_extend_forward_end_encodeBlockAsm2MB
|
|
MOVW (R9)(R11*1), R10
|
|
CMPW (SI)(R11*1), R10
|
|
JNE matchlen_match1_repeat_extend_encodeBlockAsm2MB
|
|
LEAL 2(R11), R11
|
|
SUBL $0x02, R8
|
|
JZ repeat_extend_forward_end_encodeBlockAsm2MB
|
|
|
|
matchlen_match1_repeat_extend_encodeBlockAsm2MB:
|
|
MOVB (R9)(R11*1), R10
|
|
CMPB (SI)(R11*1), R10
|
|
JNE repeat_extend_forward_end_encodeBlockAsm2MB
|
|
LEAL 1(R11), R11
|
|
|
|
repeat_extend_forward_end_encodeBlockAsm2MB:
|
|
ADDL R11, DX
|
|
MOVL DX, SI
|
|
SUBL DI, SI
|
|
MOVL 16(SP), DI
|
|
|
|
// emitRepeat
|
|
LEAL -1(SI), DI
|
|
CMPL SI, $0x1d
|
|
JBE repeat_one_match_repeat_encodeBlockAsm2MB
|
|
LEAL -30(SI), DI
|
|
CMPL SI, $0x0000011e
|
|
JB repeat_two_match_repeat_encodeBlockAsm2MB
|
|
CMPL SI, $0x0001001e
|
|
JB repeat_three_match_repeat_encodeBlockAsm2MB
|
|
MOVB $0xfc, (CX)
|
|
MOVL DI, 1(CX)
|
|
ADDQ $0x04, CX
|
|
JMP repeat_end_emit_encodeBlockAsm2MB
|
|
|
|
repeat_three_match_repeat_encodeBlockAsm2MB:
|
|
MOVB $0xf4, (CX)
|
|
MOVW DI, 1(CX)
|
|
ADDQ $0x03, CX
|
|
JMP repeat_end_emit_encodeBlockAsm2MB
|
|
|
|
repeat_two_match_repeat_encodeBlockAsm2MB:
|
|
MOVB $0xec, (CX)
|
|
MOVB DI, 1(CX)
|
|
ADDQ $0x02, CX
|
|
JMP repeat_end_emit_encodeBlockAsm2MB
|
|
|
|
repeat_one_match_repeat_encodeBlockAsm2MB:
|
|
XORL DI, DI
|
|
LEAL -4(DI)(SI*8), DI
|
|
MOVB DI, (CX)
|
|
ADDQ $0x01, CX
|
|
|
|
repeat_end_emit_encodeBlockAsm2MB:
|
|
MOVL DX, 12(SP)
|
|
JMP search_loop_encodeBlockAsm2MB
|
|
|
|
no_repeat_found_encodeBlockAsm2MB:
|
|
CMPL (BX)(SI*1), DI
|
|
JEQ candidate_match_encodeBlockAsm2MB
|
|
SHRQ $0x08, DI
|
|
MOVL (AX)(R10*4), SI
|
|
LEAL 2(DX), R9
|
|
CMPL (BX)(R8*1), DI
|
|
JEQ candidate2_match_encodeBlockAsm2MB
|
|
MOVL R9, (AX)(R10*4)
|
|
SHRQ $0x08, DI
|
|
CMPL (BX)(SI*1), DI
|
|
JEQ candidate3_match_encodeBlockAsm2MB
|
|
MOVL 20(SP), DX
|
|
JMP search_loop_encodeBlockAsm2MB
|
|
|
|
candidate3_match_encodeBlockAsm2MB:
|
|
ADDL $0x02, DX
|
|
JMP candidate_match_encodeBlockAsm2MB
|
|
|
|
candidate2_match_encodeBlockAsm2MB:
|
|
MOVL R9, (AX)(R10*4)
|
|
INCL DX
|
|
MOVL R8, SI
|
|
|
|
candidate_match_encodeBlockAsm2MB:
|
|
MOVL 12(SP), DI
|
|
TESTL SI, SI
|
|
JZ match_extend_back_end_encodeBlockAsm2MB
|
|
|
|
match_extend_back_loop_encodeBlockAsm2MB:
|
|
CMPL DX, DI
|
|
JBE match_extend_back_end_encodeBlockAsm2MB
|
|
MOVB -1(BX)(SI*1), R8
|
|
MOVB -1(BX)(DX*1), R9
|
|
CMPB R8, R9
|
|
JNE match_extend_back_end_encodeBlockAsm2MB
|
|
LEAL -1(DX), DX
|
|
DECL SI
|
|
JZ match_extend_back_end_encodeBlockAsm2MB
|
|
JMP match_extend_back_loop_encodeBlockAsm2MB
|
|
|
|
match_extend_back_end_encodeBlockAsm2MB:
|
|
CMPQ CX, (SP)
|
|
JB dst_size_check_ok_2
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
dst_size_check_ok_2:
|
|
MOVL DX, R8
|
|
MOVL DX, DI
|
|
SUBL SI, DI
|
|
MOVL DI, 16(SP)
|
|
ADDL $0x04, DX
|
|
ADDL $0x04, SI
|
|
MOVQ src_len+32(FP), DI
|
|
SUBL DX, DI
|
|
LEAQ (BX)(DX*1), R9
|
|
LEAQ (BX)(SI*1), SI
|
|
|
|
// matchLen
|
|
XORL R11, R11
|
|
JMP matchlen_loop_16_entry_match_nolit_encodeBlockAsm2MB
|
|
|
|
matchlen_loopback_16_match_nolit_encodeBlockAsm2MB:
|
|
MOVQ (R9)(R11*1), R10
|
|
MOVQ 8(R9)(R11*1), R12
|
|
XORQ (SI)(R11*1), R10
|
|
JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm2MB
|
|
XORQ 8(SI)(R11*1), R12
|
|
JNZ matchlen_bsf_16match_nolit_encodeBlockAsm2MB
|
|
LEAL -16(DI), DI
|
|
LEAL 16(R11), R11
|
|
|
|
matchlen_loop_16_entry_match_nolit_encodeBlockAsm2MB:
|
|
CMPL DI, $0x10
|
|
JAE matchlen_loopback_16_match_nolit_encodeBlockAsm2MB
|
|
JMP matchlen_match8_match_nolit_encodeBlockAsm2MB
|
|
|
|
matchlen_bsf_16match_nolit_encodeBlockAsm2MB:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R12, R12
|
|
|
|
#else
|
|
BSFQ R12, R12
|
|
|
|
#endif
|
|
SARQ $0x03, R12
|
|
LEAL 8(R11)(R12*1), R11
|
|
JMP match_nolit_end_encodeBlockAsm2MB
|
|
|
|
matchlen_match8_match_nolit_encodeBlockAsm2MB:
|
|
CMPL DI, $0x08
|
|
JB matchlen_match4_match_nolit_encodeBlockAsm2MB
|
|
MOVQ (R9)(R11*1), R10
|
|
XORQ (SI)(R11*1), R10
|
|
JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm2MB
|
|
LEAL -8(DI), DI
|
|
LEAL 8(R11), R11
|
|
JMP matchlen_match4_match_nolit_encodeBlockAsm2MB
|
|
|
|
matchlen_bsf_8_match_nolit_encodeBlockAsm2MB:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R10, R10
|
|
|
|
#else
|
|
BSFQ R10, R10
|
|
|
|
#endif
|
|
SARQ $0x03, R10
|
|
LEAL (R11)(R10*1), R11
|
|
JMP match_nolit_end_encodeBlockAsm2MB
|
|
|
|
matchlen_match4_match_nolit_encodeBlockAsm2MB:
|
|
CMPL DI, $0x04
|
|
JB matchlen_match2_match_nolit_encodeBlockAsm2MB
|
|
MOVL (R9)(R11*1), R10
|
|
CMPL (SI)(R11*1), R10
|
|
JNE matchlen_match2_match_nolit_encodeBlockAsm2MB
|
|
LEAL -4(DI), DI
|
|
LEAL 4(R11), R11
|
|
|
|
matchlen_match2_match_nolit_encodeBlockAsm2MB:
|
|
CMPL DI, $0x01
|
|
JE matchlen_match1_match_nolit_encodeBlockAsm2MB
|
|
JB match_nolit_end_encodeBlockAsm2MB
|
|
MOVW (R9)(R11*1), R10
|
|
CMPW (SI)(R11*1), R10
|
|
JNE matchlen_match1_match_nolit_encodeBlockAsm2MB
|
|
LEAL 2(R11), R11
|
|
SUBL $0x02, DI
|
|
JZ match_nolit_end_encodeBlockAsm2MB
|
|
|
|
matchlen_match1_match_nolit_encodeBlockAsm2MB:
|
|
MOVB (R9)(R11*1), R10
|
|
CMPB (SI)(R11*1), R10
|
|
JNE match_nolit_end_encodeBlockAsm2MB
|
|
LEAL 1(R11), R11
|
|
|
|
match_nolit_end_encodeBlockAsm2MB:
|
|
ADDL R11, DX
|
|
ADDL $0x04, R11
|
|
MOVL 16(SP), SI
|
|
MOVL 12(SP), DI
|
|
MOVL DX, 12(SP)
|
|
SUBL DI, R8
|
|
JZ match_nolits_copy_encodeBlockAsm2MB
|
|
LEAQ (BX)(DI*1), DI
|
|
CMPL R8, $0x03
|
|
JA match_emit_lits_copy_encodeBlockAsm2MB
|
|
CMPL SI, $0x40
|
|
JB match_emit_lits_copy_encodeBlockAsm2MB
|
|
MOVL (DI), DI
|
|
CMPL SI, $0x0001003f
|
|
JBE match_emit_copy2lits_encodeBlockAsm2MB
|
|
|
|
// emitCopy3
|
|
LEAL -4(R11), R11
|
|
LEAL -65536(SI), SI
|
|
SHLL $0x0b, SI
|
|
LEAL 7(SI)(R8*8), SI
|
|
CMPL R11, $0x3c
|
|
JBE emit_copy3_0_match_emit_lits_encodeBlockAsm2MB
|
|
LEAL -60(R11), R9
|
|
CMPL R11, $0x0000013c
|
|
JB emit_copy3_1_match_emit_lits_encodeBlockAsm2MB
|
|
CMPL R11, $0x0001003c
|
|
JB emit_copy3_2_match_emit_lits_encodeBlockAsm2MB
|
|
ADDL $0x000007e0, SI
|
|
MOVL SI, (CX)
|
|
MOVL R9, 4(CX)
|
|
ADDQ $0x07, CX
|
|
JMP match_emit_copy_litsencodeBlockAsm2MB
|
|
|
|
emit_copy3_2_match_emit_lits_encodeBlockAsm2MB:
|
|
ADDL $0x000007c0, SI
|
|
MOVL SI, (CX)
|
|
MOVW R9, 4(CX)
|
|
ADDQ $0x06, CX
|
|
JMP match_emit_copy_litsencodeBlockAsm2MB
|
|
|
|
emit_copy3_1_match_emit_lits_encodeBlockAsm2MB:
|
|
ADDL $0x000007a0, SI
|
|
MOVL SI, (CX)
|
|
MOVB R9, 4(CX)
|
|
ADDQ $0x05, CX
|
|
JMP match_emit_copy_litsencodeBlockAsm2MB
|
|
|
|
emit_copy3_0_match_emit_lits_encodeBlockAsm2MB:
|
|
SHLL $0x05, R11
|
|
ORL R11, SI
|
|
MOVL SI, (CX)
|
|
ADDQ $0x04, CX
|
|
|
|
match_emit_copy_litsencodeBlockAsm2MB:
|
|
MOVL DI, (CX)
|
|
ADDQ R8, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm2MB
|
|
|
|
match_emit_copy2lits_encodeBlockAsm2MB:
|
|
// emitCopy2WithLits
|
|
XORQ R9, R9
|
|
SUBL $0x40, SI
|
|
LEAL -11(R11), R10
|
|
LEAL -4(R11), R11
|
|
MOVW SI, 1(CX)
|
|
CMPL R11, $0x07
|
|
CMOVLGE R10, R9
|
|
MOVQ $0x00000007, SI
|
|
CMOVLLT R11, SI
|
|
LEAL -1(R8)(SI*4), SI
|
|
MOVL $0x00000003, R10
|
|
LEAL (R10)(SI*8), SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x03, CX
|
|
MOVL DI, (CX)
|
|
ADDQ R8, CX
|
|
TESTL R9, R9
|
|
JZ match_nolit_emitcopy_end_encodeBlockAsm2MB
|
|
|
|
// emitRepeat
|
|
LEAL -1(R9), SI
|
|
CMPL R9, $0x1d
|
|
JBE repeat_one_match_emit_repeat_copy2_encodeBlockAsm2MB
|
|
LEAL -30(R9), SI
|
|
CMPL R9, $0x0000011e
|
|
JB repeat_two_match_emit_repeat_copy2_encodeBlockAsm2MB
|
|
CMPL R9, $0x0001001e
|
|
JB repeat_three_match_emit_repeat_copy2_encodeBlockAsm2MB
|
|
MOVB $0xfc, (CX)
|
|
MOVL SI, 1(CX)
|
|
ADDQ $0x04, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm2MB
|
|
|
|
repeat_three_match_emit_repeat_copy2_encodeBlockAsm2MB:
|
|
MOVB $0xf4, (CX)
|
|
MOVW SI, 1(CX)
|
|
ADDQ $0x03, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm2MB
|
|
|
|
repeat_two_match_emit_repeat_copy2_encodeBlockAsm2MB:
|
|
MOVB $0xec, (CX)
|
|
MOVB SI, 1(CX)
|
|
ADDQ $0x02, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm2MB
|
|
|
|
repeat_one_match_emit_repeat_copy2_encodeBlockAsm2MB:
|
|
XORL SI, SI
|
|
LEAL -4(SI)(R9*8), SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x01, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm2MB
|
|
|
|
match_emit_lits_copy_encodeBlockAsm2MB:
|
|
LEAQ 4(CX)(R8*1), R9
|
|
CMPQ R9, (SP)
|
|
JB dst_size_check_ok_3
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
dst_size_check_ok_3:
|
|
// emitLiteral
|
|
LEAL -1(R8), R9
|
|
CMPL R9, $0x1d
|
|
JB one_byte_match_emit_encodeBlockAsm2MB
|
|
SUBL $0x1d, R9
|
|
CMPL R9, $0x00000100
|
|
JB two_bytes_match_emit_encodeBlockAsm2MB
|
|
CMPL R9, $0x00010000
|
|
JB three_bytes_match_emit_encodeBlockAsm2MB
|
|
MOVL R9, R10
|
|
SHRL $0x10, R10
|
|
MOVB $0xf8, (CX)
|
|
MOVW R9, 1(CX)
|
|
MOVB R10, 3(CX)
|
|
ADDQ $0x04, CX
|
|
ADDL $0x1d, R9
|
|
JMP memmove_long_match_emit_encodeBlockAsm2MB
|
|
|
|
three_bytes_match_emit_encodeBlockAsm2MB:
|
|
MOVB $0xf0, (CX)
|
|
MOVW R9, 1(CX)
|
|
ADDQ $0x03, CX
|
|
ADDL $0x1d, R9
|
|
JMP memmove_long_match_emit_encodeBlockAsm2MB
|
|
|
|
two_bytes_match_emit_encodeBlockAsm2MB:
|
|
MOVB $0xe8, (CX)
|
|
MOVB R9, 1(CX)
|
|
ADDL $0x1d, R9
|
|
ADDQ $0x02, CX
|
|
CMPL R9, $0x40
|
|
JB memmove_midmatch_emit_encodeBlockAsm2MB
|
|
JMP memmove_long_match_emit_encodeBlockAsm2MB
|
|
|
|
one_byte_match_emit_encodeBlockAsm2MB:
|
|
SHLB $0x03, R9
|
|
MOVB R9, (CX)
|
|
ADDQ $0x01, CX
|
|
LEAQ (CX)(R8*1), R9
|
|
|
|
// genMemMoveShort
|
|
// margin: 16, min move: 1
|
|
CMPQ R8, $0x10
|
|
JBE emit_lit_memmove_match_emit_encodeBlockAsm2MB_memmove_move_8through16
|
|
CMPQ R8, $0x20
|
|
JBE emit_lit_memmove_match_emit_encodeBlockAsm2MB_memmove_move_17through32
|
|
JMP emit_lit_memmove_match_emit_encodeBlockAsm2MB_memmove_move_33through64
|
|
|
|
emit_lit_memmove_match_emit_encodeBlockAsm2MB_memmove_move_8through16:
|
|
MOVOU (DI), X0
|
|
MOVOU X0, (CX)
|
|
JMP memmove_end_copy_match_emit_encodeBlockAsm2MB
|
|
|
|
emit_lit_memmove_match_emit_encodeBlockAsm2MB_memmove_move_17through32:
|
|
MOVOU (DI), X0
|
|
MOVOU -16(DI)(R8*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(R8*1)
|
|
JMP memmove_end_copy_match_emit_encodeBlockAsm2MB
|
|
|
|
emit_lit_memmove_match_emit_encodeBlockAsm2MB_memmove_move_33through64:
|
|
MOVOU (DI), X0
|
|
MOVOU 16(DI), X1
|
|
MOVOU -32(DI)(R8*1), X2
|
|
MOVOU -16(DI)(R8*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(R8*1)
|
|
MOVOU X3, -16(CX)(R8*1)
|
|
|
|
memmove_end_copy_match_emit_encodeBlockAsm2MB:
|
|
MOVQ R9, CX
|
|
JMP match_nolits_copy_encodeBlockAsm2MB
|
|
|
|
memmove_midmatch_emit_encodeBlockAsm2MB:
|
|
LEAQ (CX)(R8*1), R9
|
|
|
|
// genMemMoveShort
|
|
// margin: 15, min move: 30
|
|
CMPQ R8, $0x20
|
|
JBE emit_lit_memmove_mid_match_emit_encodeBlockAsm2MB_memmove_move_17through32
|
|
JMP emit_lit_memmove_mid_match_emit_encodeBlockAsm2MB_memmove_move_33through64
|
|
|
|
emit_lit_memmove_mid_match_emit_encodeBlockAsm2MB_memmove_move_17through32:
|
|
MOVOU (DI), X0
|
|
MOVOU -16(DI)(R8*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(R8*1)
|
|
JMP memmove_mid_end_copy_match_emit_encodeBlockAsm2MB
|
|
|
|
emit_lit_memmove_mid_match_emit_encodeBlockAsm2MB_memmove_move_33through64:
|
|
MOVOU (DI), X0
|
|
MOVOU 16(DI), X1
|
|
MOVOU -32(DI)(R8*1), X2
|
|
MOVOU -16(DI)(R8*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(R8*1)
|
|
MOVOU X3, -16(CX)(R8*1)
|
|
|
|
memmove_mid_end_copy_match_emit_encodeBlockAsm2MB:
|
|
MOVQ R9, CX
|
|
JMP match_nolits_copy_encodeBlockAsm2MB
|
|
|
|
memmove_long_match_emit_encodeBlockAsm2MB:
|
|
LEAQ (CX)(R8*1), R9
|
|
|
|
// genMemMoveLong
|
|
MOVOU (DI), X0
|
|
MOVOU 16(DI), X1
|
|
MOVOU -32(DI)(R8*1), X2
|
|
MOVOU -16(DI)(R8*1), X3
|
|
MOVQ R8, R12
|
|
SHRQ $0x05, R12
|
|
MOVQ CX, R10
|
|
ANDL $0x0000001f, R10
|
|
MOVQ $0x00000040, R13
|
|
SUBQ R10, R13
|
|
DECQ R12
|
|
JA emit_lit_memmove_long_match_emit_encodeBlockAsm2MBlarge_forward_sse_loop_32
|
|
LEAQ -32(DI)(R13*1), R10
|
|
LEAQ -32(CX)(R13*1), R14
|
|
|
|
emit_lit_memmove_long_match_emit_encodeBlockAsm2MBlarge_big_loop_back:
|
|
MOVOU (R10), X4
|
|
MOVOU 16(R10), X5
|
|
MOVOA X4, (R14)
|
|
MOVOA X5, 16(R14)
|
|
ADDQ $0x20, R14
|
|
ADDQ $0x20, R10
|
|
ADDQ $0x20, R13
|
|
DECQ R12
|
|
JNA emit_lit_memmove_long_match_emit_encodeBlockAsm2MBlarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_match_emit_encodeBlockAsm2MBlarge_forward_sse_loop_32:
|
|
MOVOU -32(DI)(R13*1), X4
|
|
MOVOU -16(DI)(R13*1), X5
|
|
MOVOA X4, -32(CX)(R13*1)
|
|
MOVOA X5, -16(CX)(R13*1)
|
|
ADDQ $0x20, R13
|
|
CMPQ R8, R13
|
|
JAE emit_lit_memmove_long_match_emit_encodeBlockAsm2MBlarge_forward_sse_loop_32
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(R8*1)
|
|
MOVOU X3, -16(CX)(R8*1)
|
|
MOVQ R9, CX
|
|
|
|
match_nolits_copy_encodeBlockAsm2MB:
|
|
// emitCopy
|
|
CMPL SI, $0x0001003f
|
|
JBE two_byte_offset_match_nolit_encodeBlockAsm2MB
|
|
|
|
// emitCopy3
|
|
LEAL -4(R11), R11
|
|
LEAL -65536(SI), SI
|
|
SHLL $0x0b, SI
|
|
ADDL $0x07, SI
|
|
CMPL R11, $0x3c
|
|
JBE emit_copy3_0_match_nolit_encodeBlockAsm2MB_emit3
|
|
LEAL -60(R11), DI
|
|
CMPL R11, $0x0000013c
|
|
JB emit_copy3_1_match_nolit_encodeBlockAsm2MB_emit3
|
|
CMPL R11, $0x0001003c
|
|
JB emit_copy3_2_match_nolit_encodeBlockAsm2MB_emit3
|
|
ADDL $0x000007e0, SI
|
|
MOVL SI, (CX)
|
|
MOVL DI, 4(CX)
|
|
ADDQ $0x07, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm2MB
|
|
|
|
emit_copy3_2_match_nolit_encodeBlockAsm2MB_emit3:
|
|
ADDL $0x000007c0, SI
|
|
MOVL SI, (CX)
|
|
MOVW DI, 4(CX)
|
|
ADDQ $0x06, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm2MB
|
|
|
|
emit_copy3_1_match_nolit_encodeBlockAsm2MB_emit3:
|
|
ADDL $0x000007a0, SI
|
|
MOVL SI, (CX)
|
|
MOVB DI, 4(CX)
|
|
ADDQ $0x05, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm2MB
|
|
|
|
emit_copy3_0_match_nolit_encodeBlockAsm2MB_emit3:
|
|
SHLL $0x05, R11
|
|
ORL R11, SI
|
|
MOVL SI, (CX)
|
|
ADDQ $0x04, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm2MB
|
|
|
|
two_byte_offset_match_nolit_encodeBlockAsm2MB:
|
|
CMPL SI, $0x00000400
|
|
JA two_byte_match_nolit_encodeBlockAsm2MB
|
|
CMPL R11, $0x00000013
|
|
JAE emit_one_longer_match_nolit_encodeBlockAsm2MB
|
|
LEAL -1(SI), SI
|
|
SHLL $0x06, SI
|
|
LEAL -15(SI)(R11*4), SI
|
|
MOVW SI, (CX)
|
|
ADDQ $0x02, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm2MB
|
|
|
|
emit_one_longer_match_nolit_encodeBlockAsm2MB:
|
|
CMPL R11, $0x00000112
|
|
JAE emit_copy1_repeat_match_nolit_encodeBlockAsm2MB
|
|
LEAL -1(SI), SI
|
|
SHLL $0x06, SI
|
|
LEAL 61(SI), SI
|
|
MOVW SI, (CX)
|
|
LEAL -18(R11), SI
|
|
MOVB SI, 2(CX)
|
|
ADDQ $0x03, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm2MB
|
|
|
|
emit_copy1_repeat_match_nolit_encodeBlockAsm2MB:
|
|
LEAL -1(SI), SI
|
|
SHLL $0x06, SI
|
|
LEAL 57(SI), SI
|
|
MOVW SI, (CX)
|
|
ADDQ $0x02, CX
|
|
SUBL $0x12, R11
|
|
|
|
// emitRepeat
|
|
LEAL -1(R11), SI
|
|
CMPL R11, $0x1d
|
|
JBE repeat_one_emit_copy1_do_repeat_match_nolit_encodeBlockAsm2MB
|
|
LEAL -30(R11), SI
|
|
CMPL R11, $0x0000011e
|
|
JB repeat_two_emit_copy1_do_repeat_match_nolit_encodeBlockAsm2MB
|
|
CMPL R11, $0x0001001e
|
|
JB repeat_three_emit_copy1_do_repeat_match_nolit_encodeBlockAsm2MB
|
|
MOVB $0xfc, (CX)
|
|
MOVL SI, 1(CX)
|
|
ADDQ $0x04, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm2MB
|
|
|
|
repeat_three_emit_copy1_do_repeat_match_nolit_encodeBlockAsm2MB:
|
|
MOVB $0xf4, (CX)
|
|
MOVW SI, 1(CX)
|
|
ADDQ $0x03, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm2MB
|
|
|
|
repeat_two_emit_copy1_do_repeat_match_nolit_encodeBlockAsm2MB:
|
|
MOVB $0xec, (CX)
|
|
MOVB SI, 1(CX)
|
|
ADDQ $0x02, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm2MB
|
|
|
|
repeat_one_emit_copy1_do_repeat_match_nolit_encodeBlockAsm2MB:
|
|
XORL SI, SI
|
|
LEAL -4(SI)(R11*8), SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x01, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm2MB
|
|
|
|
two_byte_match_nolit_encodeBlockAsm2MB:
|
|
// emitCopy2
|
|
LEAL -64(SI), SI
|
|
LEAL -4(R11), R11
|
|
MOVW SI, 1(CX)
|
|
CMPL R11, $0x3c
|
|
JBE emit_copy2_0_match_nolit_encodeBlockAsm2MB_emit2
|
|
LEAL -60(R11), SI
|
|
CMPL R11, $0x0000013c
|
|
JB emit_copy2_1_match_nolit_encodeBlockAsm2MB_emit2
|
|
CMPL R11, $0x0001003c
|
|
JB emit_copy2_2_match_nolit_encodeBlockAsm2MB_emit2
|
|
MOVB $0xfe, (CX)
|
|
MOVL SI, 3(CX)
|
|
ADDQ $0x06, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm2MB
|
|
|
|
emit_copy2_2_match_nolit_encodeBlockAsm2MB_emit2:
|
|
MOVB $0xfa, (CX)
|
|
MOVW SI, 3(CX)
|
|
ADDQ $0x05, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm2MB
|
|
|
|
emit_copy2_1_match_nolit_encodeBlockAsm2MB_emit2:
|
|
MOVB $0xf6, (CX)
|
|
MOVB SI, 3(CX)
|
|
ADDQ $0x04, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm2MB
|
|
|
|
emit_copy2_0_match_nolit_encodeBlockAsm2MB_emit2:
|
|
MOVL $0x00000002, SI
|
|
LEAL (SI)(R11*4), SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x03, CX
|
|
|
|
match_nolit_emitcopy_end_encodeBlockAsm2MB:
|
|
CMPL DX, 8(SP)
|
|
JAE emit_remainder_encodeBlockAsm2MB
|
|
MOVQ -2(BX)(DX*1), DI
|
|
CMPQ CX, (SP)
|
|
JB match_nolit_dst_ok_encodeBlockAsm2MB
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
match_nolit_dst_ok_encodeBlockAsm2MB:
|
|
MOVQ $0x0000cf1bbcdcbf9b, SI
|
|
MOVQ DI, R8
|
|
SHRQ $0x10, DI
|
|
MOVQ DI, R9
|
|
SHLQ $0x10, R8
|
|
IMULQ SI, R8
|
|
SHRQ $0x31, R8
|
|
SHLQ $0x10, R9
|
|
IMULQ SI, R9
|
|
SHRQ $0x31, R9
|
|
LEAL -2(DX), R10
|
|
MOVL (AX)(R9*4), SI
|
|
MOVL R10, (AX)(R8*4)
|
|
MOVL DX, (AX)(R9*4)
|
|
MOVL DX, R8
|
|
INCL DX
|
|
CMPL (BX)(SI*1), DI
|
|
JNE search_loop_encodeBlockAsm2MB
|
|
MOVL R8, DI
|
|
SUBL SI, DI
|
|
MOVL DI, 16(SP)
|
|
CMPQ CX, (SP)
|
|
JB dst_size_check_ok_4
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
dst_size_check_ok_4:
|
|
ADDL $0x03, DX
|
|
ADDL $0x04, SI
|
|
MOVQ src_len+32(FP), DI
|
|
SUBL DX, DI
|
|
LEAQ (BX)(DX*1), R8
|
|
LEAQ (BX)(SI*1), SI
|
|
|
|
// matchLen
|
|
XORL R11, R11
|
|
JMP matchlen_loop_16_entry_match_nolit2_encodeBlockAsm2MB
|
|
|
|
matchlen_loopback_16_match_nolit2_encodeBlockAsm2MB:
|
|
MOVQ (R8)(R11*1), R9
|
|
MOVQ 8(R8)(R11*1), R10
|
|
XORQ (SI)(R11*1), R9
|
|
JNZ matchlen_bsf_8_match_nolit2_encodeBlockAsm2MB
|
|
XORQ 8(SI)(R11*1), R10
|
|
JNZ matchlen_bsf_16match_nolit2_encodeBlockAsm2MB
|
|
LEAL -16(DI), DI
|
|
LEAL 16(R11), R11
|
|
|
|
matchlen_loop_16_entry_match_nolit2_encodeBlockAsm2MB:
|
|
CMPL DI, $0x10
|
|
JAE matchlen_loopback_16_match_nolit2_encodeBlockAsm2MB
|
|
JMP matchlen_match8_match_nolit2_encodeBlockAsm2MB
|
|
|
|
matchlen_bsf_16match_nolit2_encodeBlockAsm2MB:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R10, R10
|
|
|
|
#else
|
|
BSFQ R10, R10
|
|
|
|
#endif
|
|
SARQ $0x03, R10
|
|
LEAL 8(R11)(R10*1), R11
|
|
JMP match_nolit2_end_encodeBlockAsm2MB
|
|
|
|
matchlen_match8_match_nolit2_encodeBlockAsm2MB:
|
|
CMPL DI, $0x08
|
|
JB matchlen_match4_match_nolit2_encodeBlockAsm2MB
|
|
MOVQ (R8)(R11*1), R9
|
|
XORQ (SI)(R11*1), R9
|
|
JNZ matchlen_bsf_8_match_nolit2_encodeBlockAsm2MB
|
|
LEAL -8(DI), DI
|
|
LEAL 8(R11), R11
|
|
JMP matchlen_match4_match_nolit2_encodeBlockAsm2MB
|
|
|
|
matchlen_bsf_8_match_nolit2_encodeBlockAsm2MB:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R9, R9
|
|
|
|
#else
|
|
BSFQ R9, R9
|
|
|
|
#endif
|
|
SARQ $0x03, R9
|
|
LEAL (R11)(R9*1), R11
|
|
JMP match_nolit2_end_encodeBlockAsm2MB
|
|
|
|
matchlen_match4_match_nolit2_encodeBlockAsm2MB:
|
|
CMPL DI, $0x04
|
|
JB matchlen_match2_match_nolit2_encodeBlockAsm2MB
|
|
MOVL (R8)(R11*1), R9
|
|
CMPL (SI)(R11*1), R9
|
|
JNE matchlen_match2_match_nolit2_encodeBlockAsm2MB
|
|
LEAL -4(DI), DI
|
|
LEAL 4(R11), R11
|
|
|
|
matchlen_match2_match_nolit2_encodeBlockAsm2MB:
|
|
CMPL DI, $0x01
|
|
JE matchlen_match1_match_nolit2_encodeBlockAsm2MB
|
|
JB match_nolit2_end_encodeBlockAsm2MB
|
|
MOVW (R8)(R11*1), R9
|
|
CMPW (SI)(R11*1), R9
|
|
JNE matchlen_match1_match_nolit2_encodeBlockAsm2MB
|
|
LEAL 2(R11), R11
|
|
SUBL $0x02, DI
|
|
JZ match_nolit2_end_encodeBlockAsm2MB
|
|
|
|
matchlen_match1_match_nolit2_encodeBlockAsm2MB:
|
|
MOVB (R8)(R11*1), R9
|
|
CMPB (SI)(R11*1), R9
|
|
JNE match_nolit2_end_encodeBlockAsm2MB
|
|
LEAL 1(R11), R11
|
|
|
|
match_nolit2_end_encodeBlockAsm2MB:
|
|
ADDL R11, DX
|
|
ADDL $0x04, R11
|
|
MOVL DX, 12(SP)
|
|
MOVL 16(SP), SI
|
|
JMP match_nolits_copy_encodeBlockAsm2MB
|
|
|
|
emit_remainder_encodeBlockAsm2MB:
|
|
MOVQ src_len+32(FP), AX
|
|
MOVL 12(SP), DX
|
|
SUBL DX, AX
|
|
JZ emit_remainder_end_encodeBlockAsm2MB
|
|
LEAQ (BX)(DX*1), DX
|
|
LEAQ 4(CX)(AX*1), BX
|
|
CMPQ BX, (SP)
|
|
JB dst_size_check_ok_5
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
dst_size_check_ok_5:
|
|
// emitLiteral
|
|
LEAL -1(AX), BX
|
|
CMPL BX, $0x1d
|
|
JB one_byte_emit_remainder_encodeBlockAsm2MB
|
|
SUBL $0x1d, BX
|
|
CMPL BX, $0x00000100
|
|
JB two_bytes_emit_remainder_encodeBlockAsm2MB
|
|
CMPL BX, $0x00010000
|
|
JB three_bytes_emit_remainder_encodeBlockAsm2MB
|
|
MOVL BX, SI
|
|
SHRL $0x10, SI
|
|
MOVB $0xf8, (CX)
|
|
MOVW BX, 1(CX)
|
|
MOVB SI, 3(CX)
|
|
ADDQ $0x04, CX
|
|
ADDL $0x1d, BX
|
|
JMP memmove_long_emit_remainder_encodeBlockAsm2MB
|
|
|
|
three_bytes_emit_remainder_encodeBlockAsm2MB:
|
|
MOVB $0xf0, (CX)
|
|
MOVW BX, 1(CX)
|
|
ADDQ $0x03, CX
|
|
ADDL $0x1d, BX
|
|
JMP memmove_long_emit_remainder_encodeBlockAsm2MB
|
|
|
|
two_bytes_emit_remainder_encodeBlockAsm2MB:
|
|
MOVB $0xe8, (CX)
|
|
MOVB BL, 1(CX)
|
|
ADDL $0x1d, BX
|
|
ADDQ $0x02, CX
|
|
CMPL BX, $0x40
|
|
JB memmove_midemit_remainder_encodeBlockAsm2MB
|
|
JMP memmove_long_emit_remainder_encodeBlockAsm2MB
|
|
|
|
one_byte_emit_remainder_encodeBlockAsm2MB:
|
|
SHLB $0x03, BL
|
|
MOVB BL, (CX)
|
|
ADDQ $0x01, CX
|
|
LEAQ (CX)(AX*1), BX
|
|
|
|
// genMemMoveShort
|
|
// margin: 0, min move: 1
|
|
CMPQ AX, $0x03
|
|
JB emit_lit_memmove_emit_remainder_encodeBlockAsm2MB_memmove_move_1or2
|
|
JE emit_lit_memmove_emit_remainder_encodeBlockAsm2MB_memmove_move_3
|
|
CMPQ AX, $0x08
|
|
JBE emit_lit_memmove_emit_remainder_encodeBlockAsm2MB_memmove_move_4through8
|
|
CMPQ AX, $0x10
|
|
JBE emit_lit_memmove_emit_remainder_encodeBlockAsm2MB_memmove_move_8through16
|
|
CMPQ AX, $0x20
|
|
JBE emit_lit_memmove_emit_remainder_encodeBlockAsm2MB_memmove_move_17through32
|
|
JMP emit_lit_memmove_emit_remainder_encodeBlockAsm2MB_memmove_move_33through64
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBlockAsm2MB_memmove_move_1or2:
|
|
MOVB (DX), SI
|
|
MOVB -1(DX)(AX*1), DL
|
|
MOVB SI, (CX)
|
|
MOVB DL, -1(CX)(AX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBlockAsm2MB
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBlockAsm2MB_memmove_move_3:
|
|
MOVW (DX), SI
|
|
MOVB 2(DX), DL
|
|
MOVW SI, (CX)
|
|
MOVB DL, 2(CX)
|
|
JMP memmove_end_copy_emit_remainder_encodeBlockAsm2MB
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBlockAsm2MB_memmove_move_4through8:
|
|
MOVL (DX), SI
|
|
MOVL -4(DX)(AX*1), DX
|
|
MOVL SI, (CX)
|
|
MOVL DX, -4(CX)(AX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBlockAsm2MB
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBlockAsm2MB_memmove_move_8through16:
|
|
MOVQ (DX), SI
|
|
MOVQ -8(DX)(AX*1), DX
|
|
MOVQ SI, (CX)
|
|
MOVQ DX, -8(CX)(AX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBlockAsm2MB
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBlockAsm2MB_memmove_move_17through32:
|
|
MOVOU (DX), X0
|
|
MOVOU -16(DX)(AX*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(AX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBlockAsm2MB
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBlockAsm2MB_memmove_move_33through64:
|
|
MOVOU (DX), X0
|
|
MOVOU 16(DX), X1
|
|
MOVOU -32(DX)(AX*1), X2
|
|
MOVOU -16(DX)(AX*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(AX*1)
|
|
MOVOU X3, -16(CX)(AX*1)
|
|
|
|
memmove_end_copy_emit_remainder_encodeBlockAsm2MB:
|
|
MOVQ BX, CX
|
|
JMP emit_remainder_end_encodeBlockAsm2MB
|
|
|
|
memmove_midemit_remainder_encodeBlockAsm2MB:
|
|
LEAQ (CX)(AX*1), BX
|
|
|
|
// genMemMoveShort
|
|
// margin: 0, min move: 30
|
|
CMPQ AX, $0x20
|
|
JBE emit_lit_memmove_mid_emit_remainder_encodeBlockAsm2MB_memmove_move_17through32
|
|
JMP emit_lit_memmove_mid_emit_remainder_encodeBlockAsm2MB_memmove_move_33through64
|
|
|
|
emit_lit_memmove_mid_emit_remainder_encodeBlockAsm2MB_memmove_move_17through32:
|
|
MOVOU (DX), X0
|
|
MOVOU -16(DX)(AX*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(AX*1)
|
|
JMP memmove_mid_end_copy_emit_remainder_encodeBlockAsm2MB
|
|
|
|
emit_lit_memmove_mid_emit_remainder_encodeBlockAsm2MB_memmove_move_33through64:
|
|
MOVOU (DX), X0
|
|
MOVOU 16(DX), X1
|
|
MOVOU -32(DX)(AX*1), X2
|
|
MOVOU -16(DX)(AX*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(AX*1)
|
|
MOVOU X3, -16(CX)(AX*1)
|
|
|
|
memmove_mid_end_copy_emit_remainder_encodeBlockAsm2MB:
|
|
MOVQ BX, CX
|
|
JMP emit_remainder_end_encodeBlockAsm2MB
|
|
|
|
memmove_long_emit_remainder_encodeBlockAsm2MB:
|
|
LEAQ (CX)(AX*1), BX
|
|
|
|
// genMemMoveLong
|
|
MOVOU (DX), X0
|
|
MOVOU 16(DX), X1
|
|
MOVOU -32(DX)(AX*1), X2
|
|
MOVOU -16(DX)(AX*1), X3
|
|
MOVQ AX, DI
|
|
SHRQ $0x05, DI
|
|
MOVQ CX, SI
|
|
ANDL $0x0000001f, SI
|
|
MOVQ $0x00000040, R8
|
|
SUBQ SI, R8
|
|
DECQ DI
|
|
JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm2MBlarge_forward_sse_loop_32
|
|
LEAQ -32(DX)(R8*1), SI
|
|
LEAQ -32(CX)(R8*1), R9
|
|
|
|
emit_lit_memmove_long_emit_remainder_encodeBlockAsm2MBlarge_big_loop_back:
|
|
MOVOU (SI), X4
|
|
MOVOU 16(SI), X5
|
|
MOVOA X4, (R9)
|
|
MOVOA X5, 16(R9)
|
|
ADDQ $0x20, R9
|
|
ADDQ $0x20, SI
|
|
ADDQ $0x20, R8
|
|
DECQ DI
|
|
JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm2MBlarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_emit_remainder_encodeBlockAsm2MBlarge_forward_sse_loop_32:
|
|
MOVOU -32(DX)(R8*1), X4
|
|
MOVOU -16(DX)(R8*1), X5
|
|
MOVOA X4, -32(CX)(R8*1)
|
|
MOVOA X5, -16(CX)(R8*1)
|
|
ADDQ $0x20, R8
|
|
CMPQ AX, R8
|
|
JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm2MBlarge_forward_sse_loop_32
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(AX*1)
|
|
MOVOU X3, -16(CX)(AX*1)
|
|
MOVQ BX, CX
|
|
|
|
emit_remainder_end_encodeBlockAsm2MB:
|
|
MOVQ dst_base+0(FP), AX
|
|
SUBQ AX, CX
|
|
MOVQ CX, ret+56(FP)
|
|
RET
|
|
|
|
// func encodeBlockAsm512K(dst []byte, src []byte, tmp *[65536]byte) int
|
|
// Requires: BMI, CMOV, SSE2
|
|
TEXT ·encodeBlockAsm512K(SB), $24-64
|
|
MOVQ tmp+48(FP), AX
|
|
MOVQ dst_base+0(FP), CX
|
|
MOVQ $0x00000200, DX
|
|
MOVQ AX, BX
|
|
PXOR X0, X0
|
|
|
|
zero_loop_encodeBlockAsm512K:
|
|
MOVOU X0, (BX)
|
|
MOVOU X0, 16(BX)
|
|
MOVOU X0, 32(BX)
|
|
MOVOU X0, 48(BX)
|
|
MOVOU X0, 64(BX)
|
|
MOVOU X0, 80(BX)
|
|
MOVOU X0, 96(BX)
|
|
MOVOU X0, 112(BX)
|
|
ADDQ $0x80, BX
|
|
DECQ DX
|
|
JNZ zero_loop_encodeBlockAsm512K
|
|
MOVL $0x00000000, 12(SP)
|
|
MOVQ src_len+32(FP), DX
|
|
LEAQ -17(DX), BX
|
|
LEAQ -17(DX), SI
|
|
MOVL SI, 8(SP)
|
|
SHRQ $0x05, DX
|
|
SUBL DX, BX
|
|
LEAQ (CX)(BX*1), BX
|
|
MOVQ BX, (SP)
|
|
MOVL $0x00000001, DX
|
|
MOVL DX, 16(SP)
|
|
MOVQ src_base+24(FP), BX
|
|
|
|
search_loop_encodeBlockAsm512K:
|
|
MOVL DX, SI
|
|
SUBL 12(SP), SI
|
|
SHRL $0x06, SI
|
|
LEAL 4(DX)(SI*1), SI
|
|
CMPL SI, 8(SP)
|
|
JAE emit_remainder_encodeBlockAsm512K
|
|
MOVQ (BX)(DX*1), DI
|
|
MOVL SI, 20(SP)
|
|
MOVQ $0x0000cf1bbcdcbf9b, R9
|
|
MOVQ DI, R10
|
|
MOVQ DI, R11
|
|
SHRQ $0x08, R11
|
|
SHLQ $0x10, R10
|
|
IMULQ R9, R10
|
|
SHRQ $0x32, R10
|
|
SHLQ $0x10, R11
|
|
IMULQ R9, R11
|
|
SHRQ $0x32, R11
|
|
MOVL (AX)(R10*4), SI
|
|
MOVL (AX)(R11*4), R8
|
|
MOVL DX, (AX)(R10*4)
|
|
MOVL DX, (AX)(R11*4)
|
|
MOVQ DI, R10
|
|
SHRQ $0x10, R10
|
|
SHLQ $0x10, R10
|
|
IMULQ R9, R10
|
|
SHRQ $0x32, R10
|
|
MOVL DX, R9
|
|
SUBL 16(SP), R9
|
|
MOVL 1(BX)(R9*1), R11
|
|
MOVQ DI, R9
|
|
SHRQ $0x08, R9
|
|
CMPL R9, R11
|
|
JNE no_repeat_found_encodeBlockAsm512K
|
|
LEAL 1(DX), DI
|
|
MOVL 12(SP), SI
|
|
MOVL DI, R8
|
|
SUBL 16(SP), R8
|
|
JZ repeat_extend_back_end_encodeBlockAsm512K
|
|
|
|
repeat_extend_back_loop_encodeBlockAsm512K:
|
|
CMPL DI, SI
|
|
JBE repeat_extend_back_end_encodeBlockAsm512K
|
|
MOVB -1(BX)(R8*1), R9
|
|
MOVB -1(BX)(DI*1), R10
|
|
CMPB R9, R10
|
|
JNE repeat_extend_back_end_encodeBlockAsm512K
|
|
LEAL -1(DI), DI
|
|
DECL R8
|
|
JNZ repeat_extend_back_loop_encodeBlockAsm512K
|
|
|
|
repeat_extend_back_end_encodeBlockAsm512K:
|
|
MOVL DI, SI
|
|
MOVL 12(SP), R8
|
|
SUBL R8, SI
|
|
LEAQ 4(CX)(SI*1), R9
|
|
CMPQ R9, (SP)
|
|
JB dst_size_check_ok_1
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
dst_size_check_ok_1:
|
|
LEAQ (BX)(R8*1), R8
|
|
|
|
// emitLiteral
|
|
LEAL -1(SI), R9
|
|
CMPL R9, $0x1d
|
|
JB one_byte_repeat_emit_lits_encodeBlockAsm512K
|
|
SUBL $0x1d, R9
|
|
CMPL R9, $0x00000100
|
|
JB two_bytes_repeat_emit_lits_encodeBlockAsm512K
|
|
CMPL R9, $0x00010000
|
|
JB three_bytes_repeat_emit_lits_encodeBlockAsm512K
|
|
MOVL R9, R10
|
|
SHRL $0x10, R10
|
|
MOVB $0xf8, (CX)
|
|
MOVW R9, 1(CX)
|
|
MOVB R10, 3(CX)
|
|
ADDQ $0x04, CX
|
|
ADDL $0x1d, R9
|
|
JMP memmove_long_repeat_emit_lits_encodeBlockAsm512K
|
|
|
|
three_bytes_repeat_emit_lits_encodeBlockAsm512K:
|
|
MOVB $0xf0, (CX)
|
|
MOVW R9, 1(CX)
|
|
ADDQ $0x03, CX
|
|
ADDL $0x1d, R9
|
|
JMP memmove_long_repeat_emit_lits_encodeBlockAsm512K
|
|
|
|
two_bytes_repeat_emit_lits_encodeBlockAsm512K:
|
|
MOVB $0xe8, (CX)
|
|
MOVB R9, 1(CX)
|
|
ADDL $0x1d, R9
|
|
ADDQ $0x02, CX
|
|
CMPL R9, $0x40
|
|
JB memmove_midrepeat_emit_lits_encodeBlockAsm512K
|
|
JMP memmove_long_repeat_emit_lits_encodeBlockAsm512K
|
|
|
|
one_byte_repeat_emit_lits_encodeBlockAsm512K:
|
|
SHLB $0x03, R9
|
|
MOVB R9, (CX)
|
|
ADDQ $0x01, CX
|
|
LEAQ (CX)(SI*1), R9
|
|
|
|
// genMemMoveShort
|
|
// margin: 16, min move: 1
|
|
CMPQ SI, $0x10
|
|
JBE emit_lit_memmove_repeat_emit_lits_encodeBlockAsm512K_memmove_move_8through16
|
|
CMPQ SI, $0x20
|
|
JBE emit_lit_memmove_repeat_emit_lits_encodeBlockAsm512K_memmove_move_17through32
|
|
JMP emit_lit_memmove_repeat_emit_lits_encodeBlockAsm512K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_repeat_emit_lits_encodeBlockAsm512K_memmove_move_8through16:
|
|
MOVOU (R8), X0
|
|
MOVOU X0, (CX)
|
|
JMP memmove_end_copy_repeat_emit_lits_encodeBlockAsm512K
|
|
|
|
emit_lit_memmove_repeat_emit_lits_encodeBlockAsm512K_memmove_move_17through32:
|
|
MOVOU (R8), X0
|
|
MOVOU -16(R8)(SI*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(SI*1)
|
|
JMP memmove_end_copy_repeat_emit_lits_encodeBlockAsm512K
|
|
|
|
emit_lit_memmove_repeat_emit_lits_encodeBlockAsm512K_memmove_move_33through64:
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(SI*1), X2
|
|
MOVOU -16(R8)(SI*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(SI*1)
|
|
MOVOU X3, -16(CX)(SI*1)
|
|
|
|
memmove_end_copy_repeat_emit_lits_encodeBlockAsm512K:
|
|
MOVQ R9, CX
|
|
JMP repeat_emit_lits_end_encodeBlockAsm512K
|
|
|
|
memmove_midrepeat_emit_lits_encodeBlockAsm512K:
|
|
LEAQ (CX)(SI*1), R9
|
|
|
|
// genMemMoveShort
|
|
// margin: 15, min move: 30
|
|
CMPQ SI, $0x20
|
|
JBE emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm512K_memmove_move_17through32
|
|
JMP emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm512K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm512K_memmove_move_17through32:
|
|
MOVOU (R8), X0
|
|
MOVOU -16(R8)(SI*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(SI*1)
|
|
JMP memmove_mid_end_copy_repeat_emit_lits_encodeBlockAsm512K
|
|
|
|
emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm512K_memmove_move_33through64:
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(SI*1), X2
|
|
MOVOU -16(R8)(SI*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(SI*1)
|
|
MOVOU X3, -16(CX)(SI*1)
|
|
|
|
memmove_mid_end_copy_repeat_emit_lits_encodeBlockAsm512K:
|
|
MOVQ R9, CX
|
|
JMP repeat_emit_lits_end_encodeBlockAsm512K
|
|
|
|
memmove_long_repeat_emit_lits_encodeBlockAsm512K:
|
|
LEAQ (CX)(SI*1), R9
|
|
|
|
// genMemMoveLong
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(SI*1), X2
|
|
MOVOU -16(R8)(SI*1), X3
|
|
MOVQ SI, R11
|
|
SHRQ $0x05, R11
|
|
MOVQ CX, R10
|
|
ANDL $0x0000001f, R10
|
|
MOVQ $0x00000040, R12
|
|
SUBQ R10, R12
|
|
DECQ R11
|
|
JA emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm512Klarge_forward_sse_loop_32
|
|
LEAQ -32(R8)(R12*1), R10
|
|
LEAQ -32(CX)(R12*1), R13
|
|
|
|
emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm512Klarge_big_loop_back:
|
|
MOVOU (R10), X4
|
|
MOVOU 16(R10), X5
|
|
MOVOA X4, (R13)
|
|
MOVOA X5, 16(R13)
|
|
ADDQ $0x20, R13
|
|
ADDQ $0x20, R10
|
|
ADDQ $0x20, R12
|
|
DECQ R11
|
|
JNA emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm512Klarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm512Klarge_forward_sse_loop_32:
|
|
MOVOU -32(R8)(R12*1), X4
|
|
MOVOU -16(R8)(R12*1), X5
|
|
MOVOA X4, -32(CX)(R12*1)
|
|
MOVOA X5, -16(CX)(R12*1)
|
|
ADDQ $0x20, R12
|
|
CMPQ SI, R12
|
|
JAE emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm512Klarge_forward_sse_loop_32
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(SI*1)
|
|
MOVOU X3, -16(CX)(SI*1)
|
|
MOVQ R9, CX
|
|
|
|
repeat_emit_lits_end_encodeBlockAsm512K:
|
|
ADDL $0x05, DX
|
|
MOVL DX, SI
|
|
SUBL 16(SP), SI
|
|
MOVQ src_len+32(FP), R8
|
|
SUBL DX, R8
|
|
LEAQ (BX)(DX*1), R9
|
|
LEAQ (BX)(SI*1), SI
|
|
|
|
// matchLen
|
|
XORL R11, R11
|
|
JMP matchlen_loop_16_entry_repeat_extend_encodeBlockAsm512K
|
|
|
|
matchlen_loopback_16_repeat_extend_encodeBlockAsm512K:
|
|
MOVQ (R9)(R11*1), R10
|
|
MOVQ 8(R9)(R11*1), R12
|
|
XORQ (SI)(R11*1), R10
|
|
JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm512K
|
|
XORQ 8(SI)(R11*1), R12
|
|
JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm512K
|
|
LEAL -16(R8), R8
|
|
LEAL 16(R11), R11
|
|
|
|
matchlen_loop_16_entry_repeat_extend_encodeBlockAsm512K:
|
|
CMPL R8, $0x10
|
|
JAE matchlen_loopback_16_repeat_extend_encodeBlockAsm512K
|
|
JMP matchlen_match8_repeat_extend_encodeBlockAsm512K
|
|
|
|
matchlen_bsf_16repeat_extend_encodeBlockAsm512K:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R12, R12
|
|
|
|
#else
|
|
BSFQ R12, R12
|
|
|
|
#endif
|
|
SARQ $0x03, R12
|
|
LEAL 8(R11)(R12*1), R11
|
|
JMP repeat_extend_forward_end_encodeBlockAsm512K
|
|
|
|
matchlen_match8_repeat_extend_encodeBlockAsm512K:
|
|
CMPL R8, $0x08
|
|
JB matchlen_match4_repeat_extend_encodeBlockAsm512K
|
|
MOVQ (R9)(R11*1), R10
|
|
XORQ (SI)(R11*1), R10
|
|
JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm512K
|
|
LEAL -8(R8), R8
|
|
LEAL 8(R11), R11
|
|
JMP matchlen_match4_repeat_extend_encodeBlockAsm512K
|
|
|
|
matchlen_bsf_8_repeat_extend_encodeBlockAsm512K:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R10, R10
|
|
|
|
#else
|
|
BSFQ R10, R10
|
|
|
|
#endif
|
|
SARQ $0x03, R10
|
|
LEAL (R11)(R10*1), R11
|
|
JMP repeat_extend_forward_end_encodeBlockAsm512K
|
|
|
|
matchlen_match4_repeat_extend_encodeBlockAsm512K:
|
|
CMPL R8, $0x04
|
|
JB matchlen_match2_repeat_extend_encodeBlockAsm512K
|
|
MOVL (R9)(R11*1), R10
|
|
CMPL (SI)(R11*1), R10
|
|
JNE matchlen_match2_repeat_extend_encodeBlockAsm512K
|
|
LEAL -4(R8), R8
|
|
LEAL 4(R11), R11
|
|
|
|
matchlen_match2_repeat_extend_encodeBlockAsm512K:
|
|
CMPL R8, $0x01
|
|
JE matchlen_match1_repeat_extend_encodeBlockAsm512K
|
|
JB repeat_extend_forward_end_encodeBlockAsm512K
|
|
MOVW (R9)(R11*1), R10
|
|
CMPW (SI)(R11*1), R10
|
|
JNE matchlen_match1_repeat_extend_encodeBlockAsm512K
|
|
LEAL 2(R11), R11
|
|
SUBL $0x02, R8
|
|
JZ repeat_extend_forward_end_encodeBlockAsm512K
|
|
|
|
matchlen_match1_repeat_extend_encodeBlockAsm512K:
|
|
MOVB (R9)(R11*1), R10
|
|
CMPB (SI)(R11*1), R10
|
|
JNE repeat_extend_forward_end_encodeBlockAsm512K
|
|
LEAL 1(R11), R11
|
|
|
|
repeat_extend_forward_end_encodeBlockAsm512K:
|
|
ADDL R11, DX
|
|
MOVL DX, SI
|
|
SUBL DI, SI
|
|
MOVL 16(SP), DI
|
|
|
|
// emitRepeat
|
|
LEAL -1(SI), DI
|
|
CMPL SI, $0x1d
|
|
JBE repeat_one_match_repeat_encodeBlockAsm512K
|
|
LEAL -30(SI), DI
|
|
CMPL SI, $0x0000011e
|
|
JB repeat_two_match_repeat_encodeBlockAsm512K
|
|
CMPL SI, $0x0001001e
|
|
JB repeat_three_match_repeat_encodeBlockAsm512K
|
|
MOVB $0xfc, (CX)
|
|
MOVL DI, 1(CX)
|
|
ADDQ $0x04, CX
|
|
JMP repeat_end_emit_encodeBlockAsm512K
|
|
|
|
repeat_three_match_repeat_encodeBlockAsm512K:
|
|
MOVB $0xf4, (CX)
|
|
MOVW DI, 1(CX)
|
|
ADDQ $0x03, CX
|
|
JMP repeat_end_emit_encodeBlockAsm512K
|
|
|
|
repeat_two_match_repeat_encodeBlockAsm512K:
|
|
MOVB $0xec, (CX)
|
|
MOVB DI, 1(CX)
|
|
ADDQ $0x02, CX
|
|
JMP repeat_end_emit_encodeBlockAsm512K
|
|
|
|
repeat_one_match_repeat_encodeBlockAsm512K:
|
|
XORL DI, DI
|
|
LEAL -4(DI)(SI*8), DI
|
|
MOVB DI, (CX)
|
|
ADDQ $0x01, CX
|
|
|
|
repeat_end_emit_encodeBlockAsm512K:
|
|
MOVL DX, 12(SP)
|
|
JMP search_loop_encodeBlockAsm512K
|
|
|
|
no_repeat_found_encodeBlockAsm512K:
|
|
CMPL (BX)(SI*1), DI
|
|
JEQ candidate_match_encodeBlockAsm512K
|
|
SHRQ $0x08, DI
|
|
MOVL (AX)(R10*4), SI
|
|
LEAL 2(DX), R9
|
|
CMPL (BX)(R8*1), DI
|
|
JEQ candidate2_match_encodeBlockAsm512K
|
|
MOVL R9, (AX)(R10*4)
|
|
SHRQ $0x08, DI
|
|
CMPL (BX)(SI*1), DI
|
|
JEQ candidate3_match_encodeBlockAsm512K
|
|
MOVL 20(SP), DX
|
|
JMP search_loop_encodeBlockAsm512K
|
|
|
|
candidate3_match_encodeBlockAsm512K:
|
|
ADDL $0x02, DX
|
|
JMP candidate_match_encodeBlockAsm512K
|
|
|
|
candidate2_match_encodeBlockAsm512K:
|
|
MOVL R9, (AX)(R10*4)
|
|
INCL DX
|
|
MOVL R8, SI
|
|
|
|
candidate_match_encodeBlockAsm512K:
|
|
MOVL 12(SP), DI
|
|
TESTL SI, SI
|
|
JZ match_extend_back_end_encodeBlockAsm512K
|
|
|
|
match_extend_back_loop_encodeBlockAsm512K:
|
|
CMPL DX, DI
|
|
JBE match_extend_back_end_encodeBlockAsm512K
|
|
MOVB -1(BX)(SI*1), R8
|
|
MOVB -1(BX)(DX*1), R9
|
|
CMPB R8, R9
|
|
JNE match_extend_back_end_encodeBlockAsm512K
|
|
LEAL -1(DX), DX
|
|
DECL SI
|
|
JZ match_extend_back_end_encodeBlockAsm512K
|
|
JMP match_extend_back_loop_encodeBlockAsm512K
|
|
|
|
match_extend_back_end_encodeBlockAsm512K:
|
|
CMPQ CX, (SP)
|
|
JB dst_size_check_ok_2
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
dst_size_check_ok_2:
|
|
MOVL DX, R8
|
|
MOVL DX, DI
|
|
SUBL SI, DI
|
|
MOVL DI, 16(SP)
|
|
ADDL $0x04, DX
|
|
ADDL $0x04, SI
|
|
MOVQ src_len+32(FP), DI
|
|
SUBL DX, DI
|
|
LEAQ (BX)(DX*1), R9
|
|
LEAQ (BX)(SI*1), SI
|
|
|
|
// matchLen
|
|
XORL R11, R11
|
|
JMP matchlen_loop_16_entry_match_nolit_encodeBlockAsm512K
|
|
|
|
matchlen_loopback_16_match_nolit_encodeBlockAsm512K:
|
|
MOVQ (R9)(R11*1), R10
|
|
MOVQ 8(R9)(R11*1), R12
|
|
XORQ (SI)(R11*1), R10
|
|
JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm512K
|
|
XORQ 8(SI)(R11*1), R12
|
|
JNZ matchlen_bsf_16match_nolit_encodeBlockAsm512K
|
|
LEAL -16(DI), DI
|
|
LEAL 16(R11), R11
|
|
|
|
matchlen_loop_16_entry_match_nolit_encodeBlockAsm512K:
|
|
CMPL DI, $0x10
|
|
JAE matchlen_loopback_16_match_nolit_encodeBlockAsm512K
|
|
JMP matchlen_match8_match_nolit_encodeBlockAsm512K
|
|
|
|
matchlen_bsf_16match_nolit_encodeBlockAsm512K:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R12, R12
|
|
|
|
#else
|
|
BSFQ R12, R12
|
|
|
|
#endif
|
|
SARQ $0x03, R12
|
|
LEAL 8(R11)(R12*1), R11
|
|
JMP match_nolit_end_encodeBlockAsm512K
|
|
|
|
matchlen_match8_match_nolit_encodeBlockAsm512K:
|
|
CMPL DI, $0x08
|
|
JB matchlen_match4_match_nolit_encodeBlockAsm512K
|
|
MOVQ (R9)(R11*1), R10
|
|
XORQ (SI)(R11*1), R10
|
|
JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm512K
|
|
LEAL -8(DI), DI
|
|
LEAL 8(R11), R11
|
|
JMP matchlen_match4_match_nolit_encodeBlockAsm512K
|
|
|
|
matchlen_bsf_8_match_nolit_encodeBlockAsm512K:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R10, R10
|
|
|
|
#else
|
|
BSFQ R10, R10
|
|
|
|
#endif
|
|
SARQ $0x03, R10
|
|
LEAL (R11)(R10*1), R11
|
|
JMP match_nolit_end_encodeBlockAsm512K
|
|
|
|
matchlen_match4_match_nolit_encodeBlockAsm512K:
|
|
CMPL DI, $0x04
|
|
JB matchlen_match2_match_nolit_encodeBlockAsm512K
|
|
MOVL (R9)(R11*1), R10
|
|
CMPL (SI)(R11*1), R10
|
|
JNE matchlen_match2_match_nolit_encodeBlockAsm512K
|
|
LEAL -4(DI), DI
|
|
LEAL 4(R11), R11
|
|
|
|
matchlen_match2_match_nolit_encodeBlockAsm512K:
|
|
CMPL DI, $0x01
|
|
JE matchlen_match1_match_nolit_encodeBlockAsm512K
|
|
JB match_nolit_end_encodeBlockAsm512K
|
|
MOVW (R9)(R11*1), R10
|
|
CMPW (SI)(R11*1), R10
|
|
JNE matchlen_match1_match_nolit_encodeBlockAsm512K
|
|
LEAL 2(R11), R11
|
|
SUBL $0x02, DI
|
|
JZ match_nolit_end_encodeBlockAsm512K
|
|
|
|
matchlen_match1_match_nolit_encodeBlockAsm512K:
|
|
MOVB (R9)(R11*1), R10
|
|
CMPB (SI)(R11*1), R10
|
|
JNE match_nolit_end_encodeBlockAsm512K
|
|
LEAL 1(R11), R11
|
|
|
|
match_nolit_end_encodeBlockAsm512K:
|
|
ADDL R11, DX
|
|
ADDL $0x04, R11
|
|
MOVL 16(SP), SI
|
|
MOVL 12(SP), DI
|
|
MOVL DX, 12(SP)
|
|
SUBL DI, R8
|
|
JZ match_nolits_copy_encodeBlockAsm512K
|
|
LEAQ (BX)(DI*1), DI
|
|
CMPL R8, $0x03
|
|
JA match_emit_lits_copy_encodeBlockAsm512K
|
|
CMPL SI, $0x40
|
|
JB match_emit_lits_copy_encodeBlockAsm512K
|
|
MOVL (DI), DI
|
|
CMPL SI, $0x0001003f
|
|
JBE match_emit_copy2lits_encodeBlockAsm512K
|
|
|
|
// emitCopy3
|
|
LEAL -4(R11), R11
|
|
LEAL -65536(SI), SI
|
|
SHLL $0x0b, SI
|
|
LEAL 7(SI)(R8*8), SI
|
|
CMPL R11, $0x3c
|
|
JBE emit_copy3_0_match_emit_lits_encodeBlockAsm512K
|
|
LEAL -60(R11), R9
|
|
CMPL R11, $0x0000013c
|
|
JB emit_copy3_1_match_emit_lits_encodeBlockAsm512K
|
|
CMPL R11, $0x0001003c
|
|
JB emit_copy3_2_match_emit_lits_encodeBlockAsm512K
|
|
ADDL $0x000007e0, SI
|
|
MOVL SI, (CX)
|
|
MOVL R9, 4(CX)
|
|
ADDQ $0x07, CX
|
|
JMP match_emit_copy_litsencodeBlockAsm512K
|
|
|
|
emit_copy3_2_match_emit_lits_encodeBlockAsm512K:
|
|
ADDL $0x000007c0, SI
|
|
MOVL SI, (CX)
|
|
MOVW R9, 4(CX)
|
|
ADDQ $0x06, CX
|
|
JMP match_emit_copy_litsencodeBlockAsm512K
|
|
|
|
emit_copy3_1_match_emit_lits_encodeBlockAsm512K:
|
|
ADDL $0x000007a0, SI
|
|
MOVL SI, (CX)
|
|
MOVB R9, 4(CX)
|
|
ADDQ $0x05, CX
|
|
JMP match_emit_copy_litsencodeBlockAsm512K
|
|
|
|
emit_copy3_0_match_emit_lits_encodeBlockAsm512K:
|
|
SHLL $0x05, R11
|
|
ORL R11, SI
|
|
MOVL SI, (CX)
|
|
ADDQ $0x04, CX
|
|
|
|
match_emit_copy_litsencodeBlockAsm512K:
|
|
MOVL DI, (CX)
|
|
ADDQ R8, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm512K
|
|
|
|
match_emit_copy2lits_encodeBlockAsm512K:
|
|
// emitCopy2WithLits
|
|
XORQ R9, R9
|
|
SUBL $0x40, SI
|
|
LEAL -11(R11), R10
|
|
LEAL -4(R11), R11
|
|
MOVW SI, 1(CX)
|
|
CMPL R11, $0x07
|
|
CMOVLGE R10, R9
|
|
MOVQ $0x00000007, SI
|
|
CMOVLLT R11, SI
|
|
LEAL -1(R8)(SI*4), SI
|
|
MOVL $0x00000003, R10
|
|
LEAL (R10)(SI*8), SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x03, CX
|
|
MOVL DI, (CX)
|
|
ADDQ R8, CX
|
|
TESTL R9, R9
|
|
JZ match_nolit_emitcopy_end_encodeBlockAsm512K
|
|
|
|
// emitRepeat
|
|
LEAL -1(R9), SI
|
|
CMPL R9, $0x1d
|
|
JBE repeat_one_match_emit_repeat_copy2_encodeBlockAsm512K
|
|
LEAL -30(R9), SI
|
|
CMPL R9, $0x0000011e
|
|
JB repeat_two_match_emit_repeat_copy2_encodeBlockAsm512K
|
|
CMPL R9, $0x0001001e
|
|
JB repeat_three_match_emit_repeat_copy2_encodeBlockAsm512K
|
|
MOVB $0xfc, (CX)
|
|
MOVL SI, 1(CX)
|
|
ADDQ $0x04, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm512K
|
|
|
|
repeat_three_match_emit_repeat_copy2_encodeBlockAsm512K:
|
|
MOVB $0xf4, (CX)
|
|
MOVW SI, 1(CX)
|
|
ADDQ $0x03, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm512K
|
|
|
|
repeat_two_match_emit_repeat_copy2_encodeBlockAsm512K:
|
|
MOVB $0xec, (CX)
|
|
MOVB SI, 1(CX)
|
|
ADDQ $0x02, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm512K
|
|
|
|
repeat_one_match_emit_repeat_copy2_encodeBlockAsm512K:
|
|
XORL SI, SI
|
|
LEAL -4(SI)(R9*8), SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x01, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm512K
|
|
|
|
match_emit_lits_copy_encodeBlockAsm512K:
|
|
LEAQ 4(CX)(R8*1), R9
|
|
CMPQ R9, (SP)
|
|
JB dst_size_check_ok_3
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
dst_size_check_ok_3:
|
|
// emitLiteral
|
|
LEAL -1(R8), R9
|
|
CMPL R9, $0x1d
|
|
JB one_byte_match_emit_encodeBlockAsm512K
|
|
SUBL $0x1d, R9
|
|
CMPL R9, $0x00000100
|
|
JB two_bytes_match_emit_encodeBlockAsm512K
|
|
CMPL R9, $0x00010000
|
|
JB three_bytes_match_emit_encodeBlockAsm512K
|
|
MOVL R9, R10
|
|
SHRL $0x10, R10
|
|
MOVB $0xf8, (CX)
|
|
MOVW R9, 1(CX)
|
|
MOVB R10, 3(CX)
|
|
ADDQ $0x04, CX
|
|
ADDL $0x1d, R9
|
|
JMP memmove_long_match_emit_encodeBlockAsm512K
|
|
|
|
three_bytes_match_emit_encodeBlockAsm512K:
|
|
MOVB $0xf0, (CX)
|
|
MOVW R9, 1(CX)
|
|
ADDQ $0x03, CX
|
|
ADDL $0x1d, R9
|
|
JMP memmove_long_match_emit_encodeBlockAsm512K
|
|
|
|
two_bytes_match_emit_encodeBlockAsm512K:
|
|
MOVB $0xe8, (CX)
|
|
MOVB R9, 1(CX)
|
|
ADDL $0x1d, R9
|
|
ADDQ $0x02, CX
|
|
CMPL R9, $0x40
|
|
JB memmove_midmatch_emit_encodeBlockAsm512K
|
|
JMP memmove_long_match_emit_encodeBlockAsm512K
|
|
|
|
one_byte_match_emit_encodeBlockAsm512K:
|
|
SHLB $0x03, R9
|
|
MOVB R9, (CX)
|
|
ADDQ $0x01, CX
|
|
LEAQ (CX)(R8*1), R9
|
|
|
|
// genMemMoveShort
|
|
// margin: 16, min move: 1
|
|
CMPQ R8, $0x10
|
|
JBE emit_lit_memmove_match_emit_encodeBlockAsm512K_memmove_move_8through16
|
|
CMPQ R8, $0x20
|
|
JBE emit_lit_memmove_match_emit_encodeBlockAsm512K_memmove_move_17through32
|
|
JMP emit_lit_memmove_match_emit_encodeBlockAsm512K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_match_emit_encodeBlockAsm512K_memmove_move_8through16:
|
|
MOVOU (DI), X0
|
|
MOVOU X0, (CX)
|
|
JMP memmove_end_copy_match_emit_encodeBlockAsm512K
|
|
|
|
emit_lit_memmove_match_emit_encodeBlockAsm512K_memmove_move_17through32:
|
|
MOVOU (DI), X0
|
|
MOVOU -16(DI)(R8*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(R8*1)
|
|
JMP memmove_end_copy_match_emit_encodeBlockAsm512K
|
|
|
|
emit_lit_memmove_match_emit_encodeBlockAsm512K_memmove_move_33through64:
|
|
MOVOU (DI), X0
|
|
MOVOU 16(DI), X1
|
|
MOVOU -32(DI)(R8*1), X2
|
|
MOVOU -16(DI)(R8*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(R8*1)
|
|
MOVOU X3, -16(CX)(R8*1)
|
|
|
|
memmove_end_copy_match_emit_encodeBlockAsm512K:
|
|
MOVQ R9, CX
|
|
JMP match_nolits_copy_encodeBlockAsm512K
|
|
|
|
memmove_midmatch_emit_encodeBlockAsm512K:
|
|
LEAQ (CX)(R8*1), R9
|
|
|
|
// genMemMoveShort
|
|
// margin: 15, min move: 30
|
|
CMPQ R8, $0x20
|
|
JBE emit_lit_memmove_mid_match_emit_encodeBlockAsm512K_memmove_move_17through32
|
|
JMP emit_lit_memmove_mid_match_emit_encodeBlockAsm512K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_mid_match_emit_encodeBlockAsm512K_memmove_move_17through32:
|
|
MOVOU (DI), X0
|
|
MOVOU -16(DI)(R8*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(R8*1)
|
|
JMP memmove_mid_end_copy_match_emit_encodeBlockAsm512K
|
|
|
|
emit_lit_memmove_mid_match_emit_encodeBlockAsm512K_memmove_move_33through64:
|
|
MOVOU (DI), X0
|
|
MOVOU 16(DI), X1
|
|
MOVOU -32(DI)(R8*1), X2
|
|
MOVOU -16(DI)(R8*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(R8*1)
|
|
MOVOU X3, -16(CX)(R8*1)
|
|
|
|
memmove_mid_end_copy_match_emit_encodeBlockAsm512K:
|
|
MOVQ R9, CX
|
|
JMP match_nolits_copy_encodeBlockAsm512K
|
|
|
|
memmove_long_match_emit_encodeBlockAsm512K:
|
|
LEAQ (CX)(R8*1), R9
|
|
|
|
// genMemMoveLong
|
|
MOVOU (DI), X0
|
|
MOVOU 16(DI), X1
|
|
MOVOU -32(DI)(R8*1), X2
|
|
MOVOU -16(DI)(R8*1), X3
|
|
MOVQ R8, R12
|
|
SHRQ $0x05, R12
|
|
MOVQ CX, R10
|
|
ANDL $0x0000001f, R10
|
|
MOVQ $0x00000040, R13
|
|
SUBQ R10, R13
|
|
DECQ R12
|
|
JA emit_lit_memmove_long_match_emit_encodeBlockAsm512Klarge_forward_sse_loop_32
|
|
LEAQ -32(DI)(R13*1), R10
|
|
LEAQ -32(CX)(R13*1), R14
|
|
|
|
emit_lit_memmove_long_match_emit_encodeBlockAsm512Klarge_big_loop_back:
|
|
MOVOU (R10), X4
|
|
MOVOU 16(R10), X5
|
|
MOVOA X4, (R14)
|
|
MOVOA X5, 16(R14)
|
|
ADDQ $0x20, R14
|
|
ADDQ $0x20, R10
|
|
ADDQ $0x20, R13
|
|
DECQ R12
|
|
JNA emit_lit_memmove_long_match_emit_encodeBlockAsm512Klarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_match_emit_encodeBlockAsm512Klarge_forward_sse_loop_32:
|
|
MOVOU -32(DI)(R13*1), X4
|
|
MOVOU -16(DI)(R13*1), X5
|
|
MOVOA X4, -32(CX)(R13*1)
|
|
MOVOA X5, -16(CX)(R13*1)
|
|
ADDQ $0x20, R13
|
|
CMPQ R8, R13
|
|
JAE emit_lit_memmove_long_match_emit_encodeBlockAsm512Klarge_forward_sse_loop_32
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(R8*1)
|
|
MOVOU X3, -16(CX)(R8*1)
|
|
MOVQ R9, CX
|
|
|
|
match_nolits_copy_encodeBlockAsm512K:
|
|
// emitCopy
|
|
CMPL SI, $0x0001003f
|
|
JBE two_byte_offset_match_nolit_encodeBlockAsm512K
|
|
|
|
// emitCopy3
|
|
LEAL -4(R11), R11
|
|
LEAL -65536(SI), SI
|
|
SHLL $0x0b, SI
|
|
ADDL $0x07, SI
|
|
CMPL R11, $0x3c
|
|
JBE emit_copy3_0_match_nolit_encodeBlockAsm512K_emit3
|
|
LEAL -60(R11), DI
|
|
CMPL R11, $0x0000013c
|
|
JB emit_copy3_1_match_nolit_encodeBlockAsm512K_emit3
|
|
CMPL R11, $0x0001003c
|
|
JB emit_copy3_2_match_nolit_encodeBlockAsm512K_emit3
|
|
ADDL $0x000007e0, SI
|
|
MOVL SI, (CX)
|
|
MOVL DI, 4(CX)
|
|
ADDQ $0x07, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm512K
|
|
|
|
emit_copy3_2_match_nolit_encodeBlockAsm512K_emit3:
|
|
ADDL $0x000007c0, SI
|
|
MOVL SI, (CX)
|
|
MOVW DI, 4(CX)
|
|
ADDQ $0x06, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm512K
|
|
|
|
emit_copy3_1_match_nolit_encodeBlockAsm512K_emit3:
|
|
ADDL $0x000007a0, SI
|
|
MOVL SI, (CX)
|
|
MOVB DI, 4(CX)
|
|
ADDQ $0x05, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm512K
|
|
|
|
emit_copy3_0_match_nolit_encodeBlockAsm512K_emit3:
|
|
SHLL $0x05, R11
|
|
ORL R11, SI
|
|
MOVL SI, (CX)
|
|
ADDQ $0x04, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm512K
|
|
|
|
two_byte_offset_match_nolit_encodeBlockAsm512K:
|
|
CMPL SI, $0x00000400
|
|
JA two_byte_match_nolit_encodeBlockAsm512K
|
|
CMPL R11, $0x00000013
|
|
JAE emit_one_longer_match_nolit_encodeBlockAsm512K
|
|
LEAL -1(SI), SI
|
|
SHLL $0x06, SI
|
|
LEAL -15(SI)(R11*4), SI
|
|
MOVW SI, (CX)
|
|
ADDQ $0x02, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm512K
|
|
|
|
emit_one_longer_match_nolit_encodeBlockAsm512K:
|
|
CMPL R11, $0x00000112
|
|
JAE emit_copy1_repeat_match_nolit_encodeBlockAsm512K
|
|
LEAL -1(SI), SI
|
|
SHLL $0x06, SI
|
|
LEAL 61(SI), SI
|
|
MOVW SI, (CX)
|
|
LEAL -18(R11), SI
|
|
MOVB SI, 2(CX)
|
|
ADDQ $0x03, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm512K
|
|
|
|
emit_copy1_repeat_match_nolit_encodeBlockAsm512K:
|
|
LEAL -1(SI), SI
|
|
SHLL $0x06, SI
|
|
LEAL 57(SI), SI
|
|
MOVW SI, (CX)
|
|
ADDQ $0x02, CX
|
|
SUBL $0x12, R11
|
|
|
|
// emitRepeat
|
|
LEAL -1(R11), SI
|
|
CMPL R11, $0x1d
|
|
JBE repeat_one_emit_copy1_do_repeat_match_nolit_encodeBlockAsm512K
|
|
LEAL -30(R11), SI
|
|
CMPL R11, $0x0000011e
|
|
JB repeat_two_emit_copy1_do_repeat_match_nolit_encodeBlockAsm512K
|
|
CMPL R11, $0x0001001e
|
|
JB repeat_three_emit_copy1_do_repeat_match_nolit_encodeBlockAsm512K
|
|
MOVB $0xfc, (CX)
|
|
MOVL SI, 1(CX)
|
|
ADDQ $0x04, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm512K
|
|
|
|
repeat_three_emit_copy1_do_repeat_match_nolit_encodeBlockAsm512K:
|
|
MOVB $0xf4, (CX)
|
|
MOVW SI, 1(CX)
|
|
ADDQ $0x03, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm512K
|
|
|
|
repeat_two_emit_copy1_do_repeat_match_nolit_encodeBlockAsm512K:
|
|
MOVB $0xec, (CX)
|
|
MOVB SI, 1(CX)
|
|
ADDQ $0x02, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm512K
|
|
|
|
repeat_one_emit_copy1_do_repeat_match_nolit_encodeBlockAsm512K:
|
|
XORL SI, SI
|
|
LEAL -4(SI)(R11*8), SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x01, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm512K
|
|
|
|
two_byte_match_nolit_encodeBlockAsm512K:
|
|
// emitCopy2
|
|
LEAL -64(SI), SI
|
|
LEAL -4(R11), R11
|
|
MOVW SI, 1(CX)
|
|
CMPL R11, $0x3c
|
|
JBE emit_copy2_0_match_nolit_encodeBlockAsm512K_emit2
|
|
LEAL -60(R11), SI
|
|
CMPL R11, $0x0000013c
|
|
JB emit_copy2_1_match_nolit_encodeBlockAsm512K_emit2
|
|
CMPL R11, $0x0001003c
|
|
JB emit_copy2_2_match_nolit_encodeBlockAsm512K_emit2
|
|
MOVB $0xfe, (CX)
|
|
MOVL SI, 3(CX)
|
|
ADDQ $0x06, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm512K
|
|
|
|
emit_copy2_2_match_nolit_encodeBlockAsm512K_emit2:
|
|
MOVB $0xfa, (CX)
|
|
MOVW SI, 3(CX)
|
|
ADDQ $0x05, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm512K
|
|
|
|
emit_copy2_1_match_nolit_encodeBlockAsm512K_emit2:
|
|
MOVB $0xf6, (CX)
|
|
MOVB SI, 3(CX)
|
|
ADDQ $0x04, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm512K
|
|
|
|
emit_copy2_0_match_nolit_encodeBlockAsm512K_emit2:
|
|
MOVL $0x00000002, SI
|
|
LEAL (SI)(R11*4), SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x03, CX
|
|
|
|
match_nolit_emitcopy_end_encodeBlockAsm512K:
|
|
CMPL DX, 8(SP)
|
|
JAE emit_remainder_encodeBlockAsm512K
|
|
MOVQ -2(BX)(DX*1), DI
|
|
CMPQ CX, (SP)
|
|
JB match_nolit_dst_ok_encodeBlockAsm512K
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
match_nolit_dst_ok_encodeBlockAsm512K:
|
|
MOVQ $0x0000cf1bbcdcbf9b, SI
|
|
MOVQ DI, R8
|
|
SHRQ $0x10, DI
|
|
MOVQ DI, R9
|
|
SHLQ $0x10, R8
|
|
IMULQ SI, R8
|
|
SHRQ $0x32, R8
|
|
SHLQ $0x10, R9
|
|
IMULQ SI, R9
|
|
SHRQ $0x32, R9
|
|
LEAL -2(DX), R10
|
|
MOVL (AX)(R9*4), SI
|
|
MOVL R10, (AX)(R8*4)
|
|
MOVL DX, (AX)(R9*4)
|
|
MOVL DX, R8
|
|
INCL DX
|
|
CMPL (BX)(SI*1), DI
|
|
JNE search_loop_encodeBlockAsm512K
|
|
MOVL R8, DI
|
|
SUBL SI, DI
|
|
MOVL DI, 16(SP)
|
|
CMPQ CX, (SP)
|
|
JB dst_size_check_ok_4
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
dst_size_check_ok_4:
|
|
ADDL $0x03, DX
|
|
ADDL $0x04, SI
|
|
MOVQ src_len+32(FP), DI
|
|
SUBL DX, DI
|
|
LEAQ (BX)(DX*1), R8
|
|
LEAQ (BX)(SI*1), SI
|
|
|
|
// matchLen
|
|
XORL R11, R11
|
|
JMP matchlen_loop_16_entry_match_nolit2_encodeBlockAsm512K
|
|
|
|
matchlen_loopback_16_match_nolit2_encodeBlockAsm512K:
|
|
MOVQ (R8)(R11*1), R9
|
|
MOVQ 8(R8)(R11*1), R10
|
|
XORQ (SI)(R11*1), R9
|
|
JNZ matchlen_bsf_8_match_nolit2_encodeBlockAsm512K
|
|
XORQ 8(SI)(R11*1), R10
|
|
JNZ matchlen_bsf_16match_nolit2_encodeBlockAsm512K
|
|
LEAL -16(DI), DI
|
|
LEAL 16(R11), R11
|
|
|
|
matchlen_loop_16_entry_match_nolit2_encodeBlockAsm512K:
|
|
CMPL DI, $0x10
|
|
JAE matchlen_loopback_16_match_nolit2_encodeBlockAsm512K
|
|
JMP matchlen_match8_match_nolit2_encodeBlockAsm512K
|
|
|
|
matchlen_bsf_16match_nolit2_encodeBlockAsm512K:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R10, R10
|
|
|
|
#else
|
|
BSFQ R10, R10
|
|
|
|
#endif
|
|
SARQ $0x03, R10
|
|
LEAL 8(R11)(R10*1), R11
|
|
JMP match_nolit2_end_encodeBlockAsm512K
|
|
|
|
matchlen_match8_match_nolit2_encodeBlockAsm512K:
|
|
CMPL DI, $0x08
|
|
JB matchlen_match4_match_nolit2_encodeBlockAsm512K
|
|
MOVQ (R8)(R11*1), R9
|
|
XORQ (SI)(R11*1), R9
|
|
JNZ matchlen_bsf_8_match_nolit2_encodeBlockAsm512K
|
|
LEAL -8(DI), DI
|
|
LEAL 8(R11), R11
|
|
JMP matchlen_match4_match_nolit2_encodeBlockAsm512K
|
|
|
|
matchlen_bsf_8_match_nolit2_encodeBlockAsm512K:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R9, R9
|
|
|
|
#else
|
|
BSFQ R9, R9
|
|
|
|
#endif
|
|
SARQ $0x03, R9
|
|
LEAL (R11)(R9*1), R11
|
|
JMP match_nolit2_end_encodeBlockAsm512K
|
|
|
|
matchlen_match4_match_nolit2_encodeBlockAsm512K:
|
|
CMPL DI, $0x04
|
|
JB matchlen_match2_match_nolit2_encodeBlockAsm512K
|
|
MOVL (R8)(R11*1), R9
|
|
CMPL (SI)(R11*1), R9
|
|
JNE matchlen_match2_match_nolit2_encodeBlockAsm512K
|
|
LEAL -4(DI), DI
|
|
LEAL 4(R11), R11
|
|
|
|
matchlen_match2_match_nolit2_encodeBlockAsm512K:
|
|
CMPL DI, $0x01
|
|
JE matchlen_match1_match_nolit2_encodeBlockAsm512K
|
|
JB match_nolit2_end_encodeBlockAsm512K
|
|
MOVW (R8)(R11*1), R9
|
|
CMPW (SI)(R11*1), R9
|
|
JNE matchlen_match1_match_nolit2_encodeBlockAsm512K
|
|
LEAL 2(R11), R11
|
|
SUBL $0x02, DI
|
|
JZ match_nolit2_end_encodeBlockAsm512K
|
|
|
|
matchlen_match1_match_nolit2_encodeBlockAsm512K:
|
|
MOVB (R8)(R11*1), R9
|
|
CMPB (SI)(R11*1), R9
|
|
JNE match_nolit2_end_encodeBlockAsm512K
|
|
LEAL 1(R11), R11
|
|
|
|
match_nolit2_end_encodeBlockAsm512K:
|
|
ADDL R11, DX
|
|
ADDL $0x04, R11
|
|
MOVL DX, 12(SP)
|
|
MOVL 16(SP), SI
|
|
JMP match_nolits_copy_encodeBlockAsm512K
|
|
|
|
emit_remainder_encodeBlockAsm512K:
|
|
MOVQ src_len+32(FP), AX
|
|
MOVL 12(SP), DX
|
|
SUBL DX, AX
|
|
JZ emit_remainder_end_encodeBlockAsm512K
|
|
LEAQ (BX)(DX*1), DX
|
|
LEAQ 4(CX)(AX*1), BX
|
|
CMPQ BX, (SP)
|
|
JB dst_size_check_ok_5
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
dst_size_check_ok_5:
|
|
// emitLiteral
|
|
LEAL -1(AX), BX
|
|
CMPL BX, $0x1d
|
|
JB one_byte_emit_remainder_encodeBlockAsm512K
|
|
SUBL $0x1d, BX
|
|
CMPL BX, $0x00000100
|
|
JB two_bytes_emit_remainder_encodeBlockAsm512K
|
|
CMPL BX, $0x00010000
|
|
JB three_bytes_emit_remainder_encodeBlockAsm512K
|
|
MOVL BX, SI
|
|
SHRL $0x10, SI
|
|
MOVB $0xf8, (CX)
|
|
MOVW BX, 1(CX)
|
|
MOVB SI, 3(CX)
|
|
ADDQ $0x04, CX
|
|
ADDL $0x1d, BX
|
|
JMP memmove_long_emit_remainder_encodeBlockAsm512K
|
|
|
|
three_bytes_emit_remainder_encodeBlockAsm512K:
|
|
MOVB $0xf0, (CX)
|
|
MOVW BX, 1(CX)
|
|
ADDQ $0x03, CX
|
|
ADDL $0x1d, BX
|
|
JMP memmove_long_emit_remainder_encodeBlockAsm512K
|
|
|
|
two_bytes_emit_remainder_encodeBlockAsm512K:
|
|
MOVB $0xe8, (CX)
|
|
MOVB BL, 1(CX)
|
|
ADDL $0x1d, BX
|
|
ADDQ $0x02, CX
|
|
CMPL BX, $0x40
|
|
JB memmove_midemit_remainder_encodeBlockAsm512K
|
|
JMP memmove_long_emit_remainder_encodeBlockAsm512K
|
|
|
|
one_byte_emit_remainder_encodeBlockAsm512K:
|
|
SHLB $0x03, BL
|
|
MOVB BL, (CX)
|
|
ADDQ $0x01, CX
|
|
LEAQ (CX)(AX*1), BX
|
|
|
|
// genMemMoveShort
|
|
// margin: 0, min move: 1
|
|
CMPQ AX, $0x03
|
|
JB emit_lit_memmove_emit_remainder_encodeBlockAsm512K_memmove_move_1or2
|
|
JE emit_lit_memmove_emit_remainder_encodeBlockAsm512K_memmove_move_3
|
|
CMPQ AX, $0x08
|
|
JBE emit_lit_memmove_emit_remainder_encodeBlockAsm512K_memmove_move_4through8
|
|
CMPQ AX, $0x10
|
|
JBE emit_lit_memmove_emit_remainder_encodeBlockAsm512K_memmove_move_8through16
|
|
CMPQ AX, $0x20
|
|
JBE emit_lit_memmove_emit_remainder_encodeBlockAsm512K_memmove_move_17through32
|
|
JMP emit_lit_memmove_emit_remainder_encodeBlockAsm512K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBlockAsm512K_memmove_move_1or2:
|
|
MOVB (DX), SI
|
|
MOVB -1(DX)(AX*1), DL
|
|
MOVB SI, (CX)
|
|
MOVB DL, -1(CX)(AX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBlockAsm512K
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBlockAsm512K_memmove_move_3:
|
|
MOVW (DX), SI
|
|
MOVB 2(DX), DL
|
|
MOVW SI, (CX)
|
|
MOVB DL, 2(CX)
|
|
JMP memmove_end_copy_emit_remainder_encodeBlockAsm512K
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBlockAsm512K_memmove_move_4through8:
|
|
MOVL (DX), SI
|
|
MOVL -4(DX)(AX*1), DX
|
|
MOVL SI, (CX)
|
|
MOVL DX, -4(CX)(AX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBlockAsm512K
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBlockAsm512K_memmove_move_8through16:
|
|
MOVQ (DX), SI
|
|
MOVQ -8(DX)(AX*1), DX
|
|
MOVQ SI, (CX)
|
|
MOVQ DX, -8(CX)(AX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBlockAsm512K
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBlockAsm512K_memmove_move_17through32:
|
|
MOVOU (DX), X0
|
|
MOVOU -16(DX)(AX*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(AX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBlockAsm512K
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBlockAsm512K_memmove_move_33through64:
|
|
MOVOU (DX), X0
|
|
MOVOU 16(DX), X1
|
|
MOVOU -32(DX)(AX*1), X2
|
|
MOVOU -16(DX)(AX*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(AX*1)
|
|
MOVOU X3, -16(CX)(AX*1)
|
|
|
|
memmove_end_copy_emit_remainder_encodeBlockAsm512K:
|
|
MOVQ BX, CX
|
|
JMP emit_remainder_end_encodeBlockAsm512K
|
|
|
|
memmove_midemit_remainder_encodeBlockAsm512K:
|
|
LEAQ (CX)(AX*1), BX
|
|
|
|
// genMemMoveShort
|
|
// margin: 0, min move: 30
|
|
CMPQ AX, $0x20
|
|
JBE emit_lit_memmove_mid_emit_remainder_encodeBlockAsm512K_memmove_move_17through32
|
|
JMP emit_lit_memmove_mid_emit_remainder_encodeBlockAsm512K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_mid_emit_remainder_encodeBlockAsm512K_memmove_move_17through32:
|
|
MOVOU (DX), X0
|
|
MOVOU -16(DX)(AX*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(AX*1)
|
|
JMP memmove_mid_end_copy_emit_remainder_encodeBlockAsm512K
|
|
|
|
emit_lit_memmove_mid_emit_remainder_encodeBlockAsm512K_memmove_move_33through64:
|
|
MOVOU (DX), X0
|
|
MOVOU 16(DX), X1
|
|
MOVOU -32(DX)(AX*1), X2
|
|
MOVOU -16(DX)(AX*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(AX*1)
|
|
MOVOU X3, -16(CX)(AX*1)
|
|
|
|
memmove_mid_end_copy_emit_remainder_encodeBlockAsm512K:
|
|
MOVQ BX, CX
|
|
JMP emit_remainder_end_encodeBlockAsm512K
|
|
|
|
memmove_long_emit_remainder_encodeBlockAsm512K:
|
|
LEAQ (CX)(AX*1), BX
|
|
|
|
// genMemMoveLong
|
|
MOVOU (DX), X0
|
|
MOVOU 16(DX), X1
|
|
MOVOU -32(DX)(AX*1), X2
|
|
MOVOU -16(DX)(AX*1), X3
|
|
MOVQ AX, DI
|
|
SHRQ $0x05, DI
|
|
MOVQ CX, SI
|
|
ANDL $0x0000001f, SI
|
|
MOVQ $0x00000040, R8
|
|
SUBQ SI, R8
|
|
DECQ DI
|
|
JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm512Klarge_forward_sse_loop_32
|
|
LEAQ -32(DX)(R8*1), SI
|
|
LEAQ -32(CX)(R8*1), R9
|
|
|
|
emit_lit_memmove_long_emit_remainder_encodeBlockAsm512Klarge_big_loop_back:
|
|
MOVOU (SI), X4
|
|
MOVOU 16(SI), X5
|
|
MOVOA X4, (R9)
|
|
MOVOA X5, 16(R9)
|
|
ADDQ $0x20, R9
|
|
ADDQ $0x20, SI
|
|
ADDQ $0x20, R8
|
|
DECQ DI
|
|
JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm512Klarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_emit_remainder_encodeBlockAsm512Klarge_forward_sse_loop_32:
|
|
MOVOU -32(DX)(R8*1), X4
|
|
MOVOU -16(DX)(R8*1), X5
|
|
MOVOA X4, -32(CX)(R8*1)
|
|
MOVOA X5, -16(CX)(R8*1)
|
|
ADDQ $0x20, R8
|
|
CMPQ AX, R8
|
|
JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm512Klarge_forward_sse_loop_32
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(AX*1)
|
|
MOVOU X3, -16(CX)(AX*1)
|
|
MOVQ BX, CX
|
|
|
|
emit_remainder_end_encodeBlockAsm512K:
|
|
MOVQ dst_base+0(FP), AX
|
|
SUBQ AX, CX
|
|
MOVQ CX, ret+56(FP)
|
|
RET
|
|
|
|
// func encodeBlockAsm64K(dst []byte, src []byte, tmp *[16384]byte) int
|
|
// Requires: BMI, CMOV, SSE2
|
|
TEXT ·encodeBlockAsm64K(SB), $24-64
|
|
MOVQ tmp+48(FP), AX
|
|
MOVQ dst_base+0(FP), CX
|
|
MOVQ $0x00000080, DX
|
|
MOVQ AX, BX
|
|
PXOR X0, X0
|
|
|
|
zero_loop_encodeBlockAsm64K:
|
|
MOVOU X0, (BX)
|
|
MOVOU X0, 16(BX)
|
|
MOVOU X0, 32(BX)
|
|
MOVOU X0, 48(BX)
|
|
MOVOU X0, 64(BX)
|
|
MOVOU X0, 80(BX)
|
|
MOVOU X0, 96(BX)
|
|
MOVOU X0, 112(BX)
|
|
ADDQ $0x80, BX
|
|
DECQ DX
|
|
JNZ zero_loop_encodeBlockAsm64K
|
|
MOVL $0x00000000, 12(SP)
|
|
MOVQ src_len+32(FP), DX
|
|
LEAQ -17(DX), BX
|
|
LEAQ -17(DX), SI
|
|
MOVL SI, 8(SP)
|
|
SHRQ $0x05, DX
|
|
SUBL DX, BX
|
|
LEAQ (CX)(BX*1), BX
|
|
MOVQ BX, (SP)
|
|
MOVL $0x00000001, DX
|
|
MOVL DX, 16(SP)
|
|
MOVQ src_base+24(FP), BX
|
|
|
|
search_loop_encodeBlockAsm64K:
|
|
MOVL DX, SI
|
|
SUBL 12(SP), SI
|
|
SHRL $0x05, SI
|
|
LEAL 4(DX)(SI*1), SI
|
|
CMPL SI, 8(SP)
|
|
JAE emit_remainder_encodeBlockAsm64K
|
|
MOVQ (BX)(DX*1), DI
|
|
MOVL SI, 20(SP)
|
|
MOVQ $0x0000cf1bbcdcbf9b, R9
|
|
MOVQ DI, R10
|
|
MOVQ DI, R11
|
|
SHRQ $0x08, R11
|
|
SHLQ $0x10, R10
|
|
IMULQ R9, R10
|
|
SHRQ $0x33, R10
|
|
SHLQ $0x10, R11
|
|
IMULQ R9, R11
|
|
SHRQ $0x33, R11
|
|
MOVWLZX (AX)(R10*2), SI
|
|
MOVWLZX (AX)(R11*2), R8
|
|
MOVW DX, (AX)(R10*2)
|
|
MOVW DX, (AX)(R11*2)
|
|
MOVQ DI, R10
|
|
SHRQ $0x10, R10
|
|
SHLQ $0x10, R10
|
|
IMULQ R9, R10
|
|
SHRQ $0x33, R10
|
|
MOVL DX, R9
|
|
SUBL 16(SP), R9
|
|
MOVL 1(BX)(R9*1), R11
|
|
MOVQ DI, R9
|
|
SHRQ $0x08, R9
|
|
CMPL R9, R11
|
|
JNE no_repeat_found_encodeBlockAsm64K
|
|
LEAL 1(DX), DI
|
|
MOVL 12(SP), SI
|
|
MOVL DI, R8
|
|
SUBL 16(SP), R8
|
|
JZ repeat_extend_back_end_encodeBlockAsm64K
|
|
|
|
repeat_extend_back_loop_encodeBlockAsm64K:
|
|
CMPL DI, SI
|
|
JBE repeat_extend_back_end_encodeBlockAsm64K
|
|
MOVB -1(BX)(R8*1), R9
|
|
MOVB -1(BX)(DI*1), R10
|
|
CMPB R9, R10
|
|
JNE repeat_extend_back_end_encodeBlockAsm64K
|
|
LEAL -1(DI), DI
|
|
DECL R8
|
|
JNZ repeat_extend_back_loop_encodeBlockAsm64K
|
|
|
|
repeat_extend_back_end_encodeBlockAsm64K:
|
|
MOVL DI, SI
|
|
MOVL 12(SP), R8
|
|
SUBL R8, SI
|
|
LEAQ 4(CX)(SI*1), R9
|
|
CMPQ R9, (SP)
|
|
JB dst_size_check_ok_1
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
dst_size_check_ok_1:
|
|
LEAQ (BX)(R8*1), R8
|
|
|
|
// emitLiteral
|
|
LEAL -1(SI), R9
|
|
CMPL R9, $0x1d
|
|
JB one_byte_repeat_emit_lits_encodeBlockAsm64K
|
|
SUBL $0x1d, R9
|
|
CMPL R9, $0x00000100
|
|
JB two_bytes_repeat_emit_lits_encodeBlockAsm64K
|
|
JB three_bytes_repeat_emit_lits_encodeBlockAsm64K
|
|
MOVL R9, R10
|
|
SHRL $0x10, R10
|
|
MOVB $0xf8, (CX)
|
|
MOVW R9, 1(CX)
|
|
MOVB R10, 3(CX)
|
|
ADDQ $0x04, CX
|
|
ADDL $0x1d, R9
|
|
JMP memmove_long_repeat_emit_lits_encodeBlockAsm64K
|
|
|
|
three_bytes_repeat_emit_lits_encodeBlockAsm64K:
|
|
MOVB $0xf0, (CX)
|
|
MOVW R9, 1(CX)
|
|
ADDQ $0x03, CX
|
|
ADDL $0x1d, R9
|
|
JMP memmove_long_repeat_emit_lits_encodeBlockAsm64K
|
|
|
|
two_bytes_repeat_emit_lits_encodeBlockAsm64K:
|
|
MOVB $0xe8, (CX)
|
|
MOVB R9, 1(CX)
|
|
ADDL $0x1d, R9
|
|
ADDQ $0x02, CX
|
|
CMPL R9, $0x40
|
|
JB memmove_midrepeat_emit_lits_encodeBlockAsm64K
|
|
JMP memmove_long_repeat_emit_lits_encodeBlockAsm64K
|
|
|
|
one_byte_repeat_emit_lits_encodeBlockAsm64K:
|
|
SHLB $0x03, R9
|
|
MOVB R9, (CX)
|
|
ADDQ $0x01, CX
|
|
LEAQ (CX)(SI*1), R9
|
|
|
|
// genMemMoveShort
|
|
// margin: 16, min move: 1
|
|
CMPQ SI, $0x10
|
|
JBE emit_lit_memmove_repeat_emit_lits_encodeBlockAsm64K_memmove_move_8through16
|
|
CMPQ SI, $0x20
|
|
JBE emit_lit_memmove_repeat_emit_lits_encodeBlockAsm64K_memmove_move_17through32
|
|
JMP emit_lit_memmove_repeat_emit_lits_encodeBlockAsm64K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_repeat_emit_lits_encodeBlockAsm64K_memmove_move_8through16:
|
|
MOVOU (R8), X0
|
|
MOVOU X0, (CX)
|
|
JMP memmove_end_copy_repeat_emit_lits_encodeBlockAsm64K
|
|
|
|
emit_lit_memmove_repeat_emit_lits_encodeBlockAsm64K_memmove_move_17through32:
|
|
MOVOU (R8), X0
|
|
MOVOU -16(R8)(SI*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(SI*1)
|
|
JMP memmove_end_copy_repeat_emit_lits_encodeBlockAsm64K
|
|
|
|
emit_lit_memmove_repeat_emit_lits_encodeBlockAsm64K_memmove_move_33through64:
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(SI*1), X2
|
|
MOVOU -16(R8)(SI*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(SI*1)
|
|
MOVOU X3, -16(CX)(SI*1)
|
|
|
|
memmove_end_copy_repeat_emit_lits_encodeBlockAsm64K:
|
|
MOVQ R9, CX
|
|
JMP repeat_emit_lits_end_encodeBlockAsm64K
|
|
|
|
memmove_midrepeat_emit_lits_encodeBlockAsm64K:
|
|
LEAQ (CX)(SI*1), R9
|
|
|
|
// genMemMoveShort
|
|
// margin: 15, min move: 30
|
|
CMPQ SI, $0x20
|
|
JBE emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm64K_memmove_move_17through32
|
|
JMP emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm64K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm64K_memmove_move_17through32:
|
|
MOVOU (R8), X0
|
|
MOVOU -16(R8)(SI*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(SI*1)
|
|
JMP memmove_mid_end_copy_repeat_emit_lits_encodeBlockAsm64K
|
|
|
|
emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm64K_memmove_move_33through64:
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(SI*1), X2
|
|
MOVOU -16(R8)(SI*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(SI*1)
|
|
MOVOU X3, -16(CX)(SI*1)
|
|
|
|
memmove_mid_end_copy_repeat_emit_lits_encodeBlockAsm64K:
|
|
MOVQ R9, CX
|
|
JMP repeat_emit_lits_end_encodeBlockAsm64K
|
|
|
|
memmove_long_repeat_emit_lits_encodeBlockAsm64K:
|
|
LEAQ (CX)(SI*1), R9
|
|
|
|
// genMemMoveLong
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(SI*1), X2
|
|
MOVOU -16(R8)(SI*1), X3
|
|
MOVQ SI, R11
|
|
SHRQ $0x05, R11
|
|
MOVQ CX, R10
|
|
ANDL $0x0000001f, R10
|
|
MOVQ $0x00000040, R12
|
|
SUBQ R10, R12
|
|
DECQ R11
|
|
JA emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm64Klarge_forward_sse_loop_32
|
|
LEAQ -32(R8)(R12*1), R10
|
|
LEAQ -32(CX)(R12*1), R13
|
|
|
|
emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm64Klarge_big_loop_back:
|
|
MOVOU (R10), X4
|
|
MOVOU 16(R10), X5
|
|
MOVOA X4, (R13)
|
|
MOVOA X5, 16(R13)
|
|
ADDQ $0x20, R13
|
|
ADDQ $0x20, R10
|
|
ADDQ $0x20, R12
|
|
DECQ R11
|
|
JNA emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm64Klarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm64Klarge_forward_sse_loop_32:
|
|
MOVOU -32(R8)(R12*1), X4
|
|
MOVOU -16(R8)(R12*1), X5
|
|
MOVOA X4, -32(CX)(R12*1)
|
|
MOVOA X5, -16(CX)(R12*1)
|
|
ADDQ $0x20, R12
|
|
CMPQ SI, R12
|
|
JAE emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm64Klarge_forward_sse_loop_32
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(SI*1)
|
|
MOVOU X3, -16(CX)(SI*1)
|
|
MOVQ R9, CX
|
|
|
|
repeat_emit_lits_end_encodeBlockAsm64K:
|
|
ADDL $0x05, DX
|
|
MOVL DX, SI
|
|
SUBL 16(SP), SI
|
|
MOVQ src_len+32(FP), R8
|
|
SUBL DX, R8
|
|
LEAQ (BX)(DX*1), R9
|
|
LEAQ (BX)(SI*1), SI
|
|
|
|
// matchLen
|
|
XORL R11, R11
|
|
JMP matchlen_loop_16_entry_repeat_extend_encodeBlockAsm64K
|
|
|
|
matchlen_loopback_16_repeat_extend_encodeBlockAsm64K:
|
|
MOVQ (R9)(R11*1), R10
|
|
MOVQ 8(R9)(R11*1), R12
|
|
XORQ (SI)(R11*1), R10
|
|
JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm64K
|
|
XORQ 8(SI)(R11*1), R12
|
|
JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm64K
|
|
LEAL -16(R8), R8
|
|
LEAL 16(R11), R11
|
|
|
|
matchlen_loop_16_entry_repeat_extend_encodeBlockAsm64K:
|
|
CMPL R8, $0x10
|
|
JAE matchlen_loopback_16_repeat_extend_encodeBlockAsm64K
|
|
JMP matchlen_match8_repeat_extend_encodeBlockAsm64K
|
|
|
|
matchlen_bsf_16repeat_extend_encodeBlockAsm64K:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R12, R12
|
|
|
|
#else
|
|
BSFQ R12, R12
|
|
|
|
#endif
|
|
SARQ $0x03, R12
|
|
LEAL 8(R11)(R12*1), R11
|
|
JMP repeat_extend_forward_end_encodeBlockAsm64K
|
|
|
|
matchlen_match8_repeat_extend_encodeBlockAsm64K:
|
|
CMPL R8, $0x08
|
|
JB matchlen_match4_repeat_extend_encodeBlockAsm64K
|
|
MOVQ (R9)(R11*1), R10
|
|
XORQ (SI)(R11*1), R10
|
|
JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm64K
|
|
LEAL -8(R8), R8
|
|
LEAL 8(R11), R11
|
|
JMP matchlen_match4_repeat_extend_encodeBlockAsm64K
|
|
|
|
matchlen_bsf_8_repeat_extend_encodeBlockAsm64K:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R10, R10
|
|
|
|
#else
|
|
BSFQ R10, R10
|
|
|
|
#endif
|
|
SARQ $0x03, R10
|
|
LEAL (R11)(R10*1), R11
|
|
JMP repeat_extend_forward_end_encodeBlockAsm64K
|
|
|
|
matchlen_match4_repeat_extend_encodeBlockAsm64K:
|
|
CMPL R8, $0x04
|
|
JB matchlen_match2_repeat_extend_encodeBlockAsm64K
|
|
MOVL (R9)(R11*1), R10
|
|
CMPL (SI)(R11*1), R10
|
|
JNE matchlen_match2_repeat_extend_encodeBlockAsm64K
|
|
LEAL -4(R8), R8
|
|
LEAL 4(R11), R11
|
|
|
|
matchlen_match2_repeat_extend_encodeBlockAsm64K:
|
|
CMPL R8, $0x01
|
|
JE matchlen_match1_repeat_extend_encodeBlockAsm64K
|
|
JB repeat_extend_forward_end_encodeBlockAsm64K
|
|
MOVW (R9)(R11*1), R10
|
|
CMPW (SI)(R11*1), R10
|
|
JNE matchlen_match1_repeat_extend_encodeBlockAsm64K
|
|
LEAL 2(R11), R11
|
|
SUBL $0x02, R8
|
|
JZ repeat_extend_forward_end_encodeBlockAsm64K
|
|
|
|
matchlen_match1_repeat_extend_encodeBlockAsm64K:
|
|
MOVB (R9)(R11*1), R10
|
|
CMPB (SI)(R11*1), R10
|
|
JNE repeat_extend_forward_end_encodeBlockAsm64K
|
|
LEAL 1(R11), R11
|
|
|
|
repeat_extend_forward_end_encodeBlockAsm64K:
|
|
ADDL R11, DX
|
|
MOVL DX, SI
|
|
SUBL DI, SI
|
|
MOVL 16(SP), DI
|
|
|
|
// emitRepeat
|
|
LEAL -1(SI), DI
|
|
CMPL SI, $0x1d
|
|
JBE repeat_one_match_repeat_encodeBlockAsm64K
|
|
LEAL -30(SI), DI
|
|
CMPL SI, $0x0000011e
|
|
JB repeat_two_match_repeat_encodeBlockAsm64K
|
|
CMPL SI, $0x0001001e
|
|
JB repeat_three_match_repeat_encodeBlockAsm64K
|
|
MOVB $0xfc, (CX)
|
|
MOVL DI, 1(CX)
|
|
ADDQ $0x04, CX
|
|
JMP repeat_end_emit_encodeBlockAsm64K
|
|
|
|
repeat_three_match_repeat_encodeBlockAsm64K:
|
|
MOVB $0xf4, (CX)
|
|
MOVW DI, 1(CX)
|
|
ADDQ $0x03, CX
|
|
JMP repeat_end_emit_encodeBlockAsm64K
|
|
|
|
repeat_two_match_repeat_encodeBlockAsm64K:
|
|
MOVB $0xec, (CX)
|
|
MOVB DI, 1(CX)
|
|
ADDQ $0x02, CX
|
|
JMP repeat_end_emit_encodeBlockAsm64K
|
|
|
|
repeat_one_match_repeat_encodeBlockAsm64K:
|
|
XORL DI, DI
|
|
LEAL -4(DI)(SI*8), DI
|
|
MOVB DI, (CX)
|
|
ADDQ $0x01, CX
|
|
|
|
repeat_end_emit_encodeBlockAsm64K:
|
|
MOVL DX, 12(SP)
|
|
JMP search_loop_encodeBlockAsm64K
|
|
|
|
no_repeat_found_encodeBlockAsm64K:
|
|
CMPL (BX)(SI*1), DI
|
|
JEQ candidate_match_encodeBlockAsm64K
|
|
SHRQ $0x08, DI
|
|
MOVWLZX (AX)(R10*2), SI
|
|
LEAL 2(DX), R9
|
|
CMPL (BX)(R8*1), DI
|
|
JEQ candidate2_match_encodeBlockAsm64K
|
|
MOVW R9, (AX)(R10*2)
|
|
SHRQ $0x08, DI
|
|
CMPL (BX)(SI*1), DI
|
|
JEQ candidate3_match_encodeBlockAsm64K
|
|
MOVL 20(SP), DX
|
|
JMP search_loop_encodeBlockAsm64K
|
|
|
|
candidate3_match_encodeBlockAsm64K:
|
|
ADDL $0x02, DX
|
|
JMP candidate_match_encodeBlockAsm64K
|
|
|
|
candidate2_match_encodeBlockAsm64K:
|
|
MOVW R9, (AX)(R10*2)
|
|
INCL DX
|
|
MOVL R8, SI
|
|
|
|
candidate_match_encodeBlockAsm64K:
|
|
MOVL 12(SP), DI
|
|
TESTL SI, SI
|
|
JZ match_extend_back_end_encodeBlockAsm64K
|
|
|
|
match_extend_back_loop_encodeBlockAsm64K:
|
|
CMPL DX, DI
|
|
JBE match_extend_back_end_encodeBlockAsm64K
|
|
MOVB -1(BX)(SI*1), R8
|
|
MOVB -1(BX)(DX*1), R9
|
|
CMPB R8, R9
|
|
JNE match_extend_back_end_encodeBlockAsm64K
|
|
LEAL -1(DX), DX
|
|
DECL SI
|
|
JZ match_extend_back_end_encodeBlockAsm64K
|
|
JMP match_extend_back_loop_encodeBlockAsm64K
|
|
|
|
match_extend_back_end_encodeBlockAsm64K:
|
|
CMPQ CX, (SP)
|
|
JB dst_size_check_ok_2
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
dst_size_check_ok_2:
|
|
MOVL DX, R8
|
|
MOVL DX, DI
|
|
SUBL SI, DI
|
|
MOVL DI, 16(SP)
|
|
ADDL $0x04, DX
|
|
ADDL $0x04, SI
|
|
MOVQ src_len+32(FP), DI
|
|
SUBL DX, DI
|
|
LEAQ (BX)(DX*1), R9
|
|
LEAQ (BX)(SI*1), SI
|
|
|
|
// matchLen
|
|
XORL R11, R11
|
|
JMP matchlen_loop_16_entry_match_nolit_encodeBlockAsm64K
|
|
|
|
matchlen_loopback_16_match_nolit_encodeBlockAsm64K:
|
|
MOVQ (R9)(R11*1), R10
|
|
MOVQ 8(R9)(R11*1), R12
|
|
XORQ (SI)(R11*1), R10
|
|
JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm64K
|
|
XORQ 8(SI)(R11*1), R12
|
|
JNZ matchlen_bsf_16match_nolit_encodeBlockAsm64K
|
|
LEAL -16(DI), DI
|
|
LEAL 16(R11), R11
|
|
|
|
matchlen_loop_16_entry_match_nolit_encodeBlockAsm64K:
|
|
CMPL DI, $0x10
|
|
JAE matchlen_loopback_16_match_nolit_encodeBlockAsm64K
|
|
JMP matchlen_match8_match_nolit_encodeBlockAsm64K
|
|
|
|
matchlen_bsf_16match_nolit_encodeBlockAsm64K:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R12, R12
|
|
|
|
#else
|
|
BSFQ R12, R12
|
|
|
|
#endif
|
|
SARQ $0x03, R12
|
|
LEAL 8(R11)(R12*1), R11
|
|
JMP match_nolit_end_encodeBlockAsm64K
|
|
|
|
matchlen_match8_match_nolit_encodeBlockAsm64K:
|
|
CMPL DI, $0x08
|
|
JB matchlen_match4_match_nolit_encodeBlockAsm64K
|
|
MOVQ (R9)(R11*1), R10
|
|
XORQ (SI)(R11*1), R10
|
|
JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm64K
|
|
LEAL -8(DI), DI
|
|
LEAL 8(R11), R11
|
|
JMP matchlen_match4_match_nolit_encodeBlockAsm64K
|
|
|
|
matchlen_bsf_8_match_nolit_encodeBlockAsm64K:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R10, R10
|
|
|
|
#else
|
|
BSFQ R10, R10
|
|
|
|
#endif
|
|
SARQ $0x03, R10
|
|
LEAL (R11)(R10*1), R11
|
|
JMP match_nolit_end_encodeBlockAsm64K
|
|
|
|
matchlen_match4_match_nolit_encodeBlockAsm64K:
|
|
CMPL DI, $0x04
|
|
JB matchlen_match2_match_nolit_encodeBlockAsm64K
|
|
MOVL (R9)(R11*1), R10
|
|
CMPL (SI)(R11*1), R10
|
|
JNE matchlen_match2_match_nolit_encodeBlockAsm64K
|
|
LEAL -4(DI), DI
|
|
LEAL 4(R11), R11
|
|
|
|
matchlen_match2_match_nolit_encodeBlockAsm64K:
|
|
CMPL DI, $0x01
|
|
JE matchlen_match1_match_nolit_encodeBlockAsm64K
|
|
JB match_nolit_end_encodeBlockAsm64K
|
|
MOVW (R9)(R11*1), R10
|
|
CMPW (SI)(R11*1), R10
|
|
JNE matchlen_match1_match_nolit_encodeBlockAsm64K
|
|
LEAL 2(R11), R11
|
|
SUBL $0x02, DI
|
|
JZ match_nolit_end_encodeBlockAsm64K
|
|
|
|
matchlen_match1_match_nolit_encodeBlockAsm64K:
|
|
MOVB (R9)(R11*1), R10
|
|
CMPB (SI)(R11*1), R10
|
|
JNE match_nolit_end_encodeBlockAsm64K
|
|
LEAL 1(R11), R11
|
|
|
|
match_nolit_end_encodeBlockAsm64K:
|
|
ADDL R11, DX
|
|
ADDL $0x04, R11
|
|
MOVL 16(SP), SI
|
|
MOVL 12(SP), DI
|
|
MOVL DX, 12(SP)
|
|
SUBL DI, R8
|
|
JZ match_nolits_copy_encodeBlockAsm64K
|
|
LEAQ (BX)(DI*1), DI
|
|
CMPL R8, $0x03
|
|
JA match_emit_lits_copy_encodeBlockAsm64K
|
|
CMPL SI, $0x40
|
|
JB match_emit_lits_copy_encodeBlockAsm64K
|
|
MOVL (DI), DI
|
|
|
|
// emitCopy2WithLits
|
|
XORQ R9, R9
|
|
SUBL $0x40, SI
|
|
LEAL -11(R11), R10
|
|
LEAL -4(R11), R11
|
|
MOVW SI, 1(CX)
|
|
CMPL R11, $0x07
|
|
CMOVLGE R10, R9
|
|
MOVQ $0x00000007, SI
|
|
CMOVLLT R11, SI
|
|
LEAL -1(R8)(SI*4), SI
|
|
MOVL $0x00000003, R10
|
|
LEAL (R10)(SI*8), SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x03, CX
|
|
MOVL DI, (CX)
|
|
ADDQ R8, CX
|
|
TESTL R9, R9
|
|
JZ match_nolit_emitcopy_end_encodeBlockAsm64K
|
|
|
|
// emitRepeat
|
|
LEAL -1(R9), SI
|
|
CMPL R9, $0x1d
|
|
JBE repeat_one_match_emit_repeat_copy2_encodeBlockAsm64K
|
|
LEAL -30(R9), SI
|
|
CMPL R9, $0x0000011e
|
|
JB repeat_two_match_emit_repeat_copy2_encodeBlockAsm64K
|
|
CMPL R9, $0x0001001e
|
|
JB repeat_three_match_emit_repeat_copy2_encodeBlockAsm64K
|
|
MOVB $0xfc, (CX)
|
|
MOVL SI, 1(CX)
|
|
ADDQ $0x04, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm64K
|
|
|
|
repeat_three_match_emit_repeat_copy2_encodeBlockAsm64K:
|
|
MOVB $0xf4, (CX)
|
|
MOVW SI, 1(CX)
|
|
ADDQ $0x03, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm64K
|
|
|
|
repeat_two_match_emit_repeat_copy2_encodeBlockAsm64K:
|
|
MOVB $0xec, (CX)
|
|
MOVB SI, 1(CX)
|
|
ADDQ $0x02, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm64K
|
|
|
|
repeat_one_match_emit_repeat_copy2_encodeBlockAsm64K:
|
|
XORL SI, SI
|
|
LEAL -4(SI)(R9*8), SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x01, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm64K
|
|
|
|
match_emit_lits_copy_encodeBlockAsm64K:
|
|
LEAQ 4(CX)(R8*1), R9
|
|
CMPQ R9, (SP)
|
|
JB dst_size_check_ok_3
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
dst_size_check_ok_3:
|
|
// emitLiteral
|
|
LEAL -1(R8), R9
|
|
CMPL R9, $0x1d
|
|
JB one_byte_match_emit_encodeBlockAsm64K
|
|
SUBL $0x1d, R9
|
|
CMPL R9, $0x00000100
|
|
JB two_bytes_match_emit_encodeBlockAsm64K
|
|
JB three_bytes_match_emit_encodeBlockAsm64K
|
|
MOVL R9, R10
|
|
SHRL $0x10, R10
|
|
MOVB $0xf8, (CX)
|
|
MOVW R9, 1(CX)
|
|
MOVB R10, 3(CX)
|
|
ADDQ $0x04, CX
|
|
ADDL $0x1d, R9
|
|
JMP memmove_long_match_emit_encodeBlockAsm64K
|
|
|
|
three_bytes_match_emit_encodeBlockAsm64K:
|
|
MOVB $0xf0, (CX)
|
|
MOVW R9, 1(CX)
|
|
ADDQ $0x03, CX
|
|
ADDL $0x1d, R9
|
|
JMP memmove_long_match_emit_encodeBlockAsm64K
|
|
|
|
two_bytes_match_emit_encodeBlockAsm64K:
|
|
MOVB $0xe8, (CX)
|
|
MOVB R9, 1(CX)
|
|
ADDL $0x1d, R9
|
|
ADDQ $0x02, CX
|
|
CMPL R9, $0x40
|
|
JB memmove_midmatch_emit_encodeBlockAsm64K
|
|
JMP memmove_long_match_emit_encodeBlockAsm64K
|
|
|
|
one_byte_match_emit_encodeBlockAsm64K:
|
|
SHLB $0x03, R9
|
|
MOVB R9, (CX)
|
|
ADDQ $0x01, CX
|
|
LEAQ (CX)(R8*1), R9
|
|
|
|
// genMemMoveShort
|
|
// margin: 16, min move: 1
|
|
CMPQ R8, $0x10
|
|
JBE emit_lit_memmove_match_emit_encodeBlockAsm64K_memmove_move_8through16
|
|
CMPQ R8, $0x20
|
|
JBE emit_lit_memmove_match_emit_encodeBlockAsm64K_memmove_move_17through32
|
|
JMP emit_lit_memmove_match_emit_encodeBlockAsm64K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_match_emit_encodeBlockAsm64K_memmove_move_8through16:
|
|
MOVOU (DI), X0
|
|
MOVOU X0, (CX)
|
|
JMP memmove_end_copy_match_emit_encodeBlockAsm64K
|
|
|
|
emit_lit_memmove_match_emit_encodeBlockAsm64K_memmove_move_17through32:
|
|
MOVOU (DI), X0
|
|
MOVOU -16(DI)(R8*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(R8*1)
|
|
JMP memmove_end_copy_match_emit_encodeBlockAsm64K
|
|
|
|
emit_lit_memmove_match_emit_encodeBlockAsm64K_memmove_move_33through64:
|
|
MOVOU (DI), X0
|
|
MOVOU 16(DI), X1
|
|
MOVOU -32(DI)(R8*1), X2
|
|
MOVOU -16(DI)(R8*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(R8*1)
|
|
MOVOU X3, -16(CX)(R8*1)
|
|
|
|
memmove_end_copy_match_emit_encodeBlockAsm64K:
|
|
MOVQ R9, CX
|
|
JMP match_nolits_copy_encodeBlockAsm64K
|
|
|
|
memmove_midmatch_emit_encodeBlockAsm64K:
|
|
LEAQ (CX)(R8*1), R9
|
|
|
|
// genMemMoveShort
|
|
// margin: 15, min move: 30
|
|
CMPQ R8, $0x20
|
|
JBE emit_lit_memmove_mid_match_emit_encodeBlockAsm64K_memmove_move_17through32
|
|
JMP emit_lit_memmove_mid_match_emit_encodeBlockAsm64K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_mid_match_emit_encodeBlockAsm64K_memmove_move_17through32:
|
|
MOVOU (DI), X0
|
|
MOVOU -16(DI)(R8*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(R8*1)
|
|
JMP memmove_mid_end_copy_match_emit_encodeBlockAsm64K
|
|
|
|
emit_lit_memmove_mid_match_emit_encodeBlockAsm64K_memmove_move_33through64:
|
|
MOVOU (DI), X0
|
|
MOVOU 16(DI), X1
|
|
MOVOU -32(DI)(R8*1), X2
|
|
MOVOU -16(DI)(R8*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(R8*1)
|
|
MOVOU X3, -16(CX)(R8*1)
|
|
|
|
memmove_mid_end_copy_match_emit_encodeBlockAsm64K:
|
|
MOVQ R9, CX
|
|
JMP match_nolits_copy_encodeBlockAsm64K
|
|
|
|
memmove_long_match_emit_encodeBlockAsm64K:
|
|
LEAQ (CX)(R8*1), R9
|
|
|
|
// genMemMoveLong
|
|
MOVOU (DI), X0
|
|
MOVOU 16(DI), X1
|
|
MOVOU -32(DI)(R8*1), X2
|
|
MOVOU -16(DI)(R8*1), X3
|
|
MOVQ R8, R12
|
|
SHRQ $0x05, R12
|
|
MOVQ CX, R10
|
|
ANDL $0x0000001f, R10
|
|
MOVQ $0x00000040, R13
|
|
SUBQ R10, R13
|
|
DECQ R12
|
|
JA emit_lit_memmove_long_match_emit_encodeBlockAsm64Klarge_forward_sse_loop_32
|
|
LEAQ -32(DI)(R13*1), R10
|
|
LEAQ -32(CX)(R13*1), R14
|
|
|
|
emit_lit_memmove_long_match_emit_encodeBlockAsm64Klarge_big_loop_back:
|
|
MOVOU (R10), X4
|
|
MOVOU 16(R10), X5
|
|
MOVOA X4, (R14)
|
|
MOVOA X5, 16(R14)
|
|
ADDQ $0x20, R14
|
|
ADDQ $0x20, R10
|
|
ADDQ $0x20, R13
|
|
DECQ R12
|
|
JNA emit_lit_memmove_long_match_emit_encodeBlockAsm64Klarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_match_emit_encodeBlockAsm64Klarge_forward_sse_loop_32:
|
|
MOVOU -32(DI)(R13*1), X4
|
|
MOVOU -16(DI)(R13*1), X5
|
|
MOVOA X4, -32(CX)(R13*1)
|
|
MOVOA X5, -16(CX)(R13*1)
|
|
ADDQ $0x20, R13
|
|
CMPQ R8, R13
|
|
JAE emit_lit_memmove_long_match_emit_encodeBlockAsm64Klarge_forward_sse_loop_32
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(R8*1)
|
|
MOVOU X3, -16(CX)(R8*1)
|
|
MOVQ R9, CX
|
|
|
|
match_nolits_copy_encodeBlockAsm64K:
|
|
// emitCopy
|
|
CMPL SI, $0x00000400
|
|
JA two_byte_match_nolit_encodeBlockAsm64K
|
|
CMPL R11, $0x00000013
|
|
JAE emit_one_longer_match_nolit_encodeBlockAsm64K
|
|
LEAL -1(SI), SI
|
|
SHLL $0x06, SI
|
|
LEAL -15(SI)(R11*4), SI
|
|
MOVW SI, (CX)
|
|
ADDQ $0x02, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm64K
|
|
|
|
emit_one_longer_match_nolit_encodeBlockAsm64K:
|
|
CMPL R11, $0x00000112
|
|
JAE emit_copy1_repeat_match_nolit_encodeBlockAsm64K
|
|
LEAL -1(SI), SI
|
|
SHLL $0x06, SI
|
|
LEAL 61(SI), SI
|
|
MOVW SI, (CX)
|
|
LEAL -18(R11), SI
|
|
MOVB SI, 2(CX)
|
|
ADDQ $0x03, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm64K
|
|
|
|
emit_copy1_repeat_match_nolit_encodeBlockAsm64K:
|
|
LEAL -1(SI), SI
|
|
SHLL $0x06, SI
|
|
LEAL 57(SI), SI
|
|
MOVW SI, (CX)
|
|
ADDQ $0x02, CX
|
|
SUBL $0x12, R11
|
|
|
|
// emitRepeat
|
|
LEAL -1(R11), SI
|
|
CMPL R11, $0x1d
|
|
JBE repeat_one_emit_copy1_do_repeat_match_nolit_encodeBlockAsm64K
|
|
LEAL -30(R11), SI
|
|
CMPL R11, $0x0000011e
|
|
JB repeat_two_emit_copy1_do_repeat_match_nolit_encodeBlockAsm64K
|
|
CMPL R11, $0x0001001e
|
|
JB repeat_three_emit_copy1_do_repeat_match_nolit_encodeBlockAsm64K
|
|
MOVB $0xfc, (CX)
|
|
MOVL SI, 1(CX)
|
|
ADDQ $0x04, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm64K
|
|
|
|
repeat_three_emit_copy1_do_repeat_match_nolit_encodeBlockAsm64K:
|
|
MOVB $0xf4, (CX)
|
|
MOVW SI, 1(CX)
|
|
ADDQ $0x03, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm64K
|
|
|
|
repeat_two_emit_copy1_do_repeat_match_nolit_encodeBlockAsm64K:
|
|
MOVB $0xec, (CX)
|
|
MOVB SI, 1(CX)
|
|
ADDQ $0x02, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm64K
|
|
|
|
repeat_one_emit_copy1_do_repeat_match_nolit_encodeBlockAsm64K:
|
|
XORL SI, SI
|
|
LEAL -4(SI)(R11*8), SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x01, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm64K
|
|
|
|
two_byte_match_nolit_encodeBlockAsm64K:
|
|
// emitCopy2
|
|
LEAL -64(SI), SI
|
|
LEAL -4(R11), R11
|
|
MOVW SI, 1(CX)
|
|
CMPL R11, $0x3c
|
|
JBE emit_copy2_0_match_nolit_encodeBlockAsm64K_emit2
|
|
LEAL -60(R11), SI
|
|
CMPL R11, $0x0000013c
|
|
JB emit_copy2_1_match_nolit_encodeBlockAsm64K_emit2
|
|
CMPL R11, $0x0001003c
|
|
JB emit_copy2_2_match_nolit_encodeBlockAsm64K_emit2
|
|
MOVB $0xfe, (CX)
|
|
MOVL SI, 3(CX)
|
|
ADDQ $0x06, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm64K
|
|
|
|
emit_copy2_2_match_nolit_encodeBlockAsm64K_emit2:
|
|
MOVB $0xfa, (CX)
|
|
MOVW SI, 3(CX)
|
|
ADDQ $0x05, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm64K
|
|
|
|
emit_copy2_1_match_nolit_encodeBlockAsm64K_emit2:
|
|
MOVB $0xf6, (CX)
|
|
MOVB SI, 3(CX)
|
|
ADDQ $0x04, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm64K
|
|
|
|
emit_copy2_0_match_nolit_encodeBlockAsm64K_emit2:
|
|
MOVL $0x00000002, SI
|
|
LEAL (SI)(R11*4), SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x03, CX
|
|
|
|
match_nolit_emitcopy_end_encodeBlockAsm64K:
|
|
CMPL DX, 8(SP)
|
|
JAE emit_remainder_encodeBlockAsm64K
|
|
MOVQ -2(BX)(DX*1), DI
|
|
CMPQ CX, (SP)
|
|
JB match_nolit_dst_ok_encodeBlockAsm64K
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
match_nolit_dst_ok_encodeBlockAsm64K:
|
|
MOVQ $0x0000cf1bbcdcbf9b, SI
|
|
MOVQ DI, R8
|
|
SHRQ $0x10, DI
|
|
MOVQ DI, R9
|
|
SHLQ $0x10, R8
|
|
IMULQ SI, R8
|
|
SHRQ $0x33, R8
|
|
SHLQ $0x10, R9
|
|
IMULQ SI, R9
|
|
SHRQ $0x33, R9
|
|
LEAL -2(DX), R10
|
|
MOVWLZX (AX)(R9*2), SI
|
|
MOVW R10, (AX)(R8*2)
|
|
MOVW DX, (AX)(R9*2)
|
|
MOVL DX, R8
|
|
INCL DX
|
|
CMPL (BX)(SI*1), DI
|
|
JNE search_loop_encodeBlockAsm64K
|
|
MOVL R8, DI
|
|
SUBL SI, DI
|
|
MOVL DI, 16(SP)
|
|
CMPQ CX, (SP)
|
|
JB dst_size_check_ok_4
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
dst_size_check_ok_4:
|
|
ADDL $0x03, DX
|
|
ADDL $0x04, SI
|
|
MOVQ src_len+32(FP), DI
|
|
SUBL DX, DI
|
|
LEAQ (BX)(DX*1), R8
|
|
LEAQ (BX)(SI*1), SI
|
|
|
|
// matchLen
|
|
XORL R11, R11
|
|
JMP matchlen_loop_16_entry_match_nolit2_encodeBlockAsm64K
|
|
|
|
matchlen_loopback_16_match_nolit2_encodeBlockAsm64K:
|
|
MOVQ (R8)(R11*1), R9
|
|
MOVQ 8(R8)(R11*1), R10
|
|
XORQ (SI)(R11*1), R9
|
|
JNZ matchlen_bsf_8_match_nolit2_encodeBlockAsm64K
|
|
XORQ 8(SI)(R11*1), R10
|
|
JNZ matchlen_bsf_16match_nolit2_encodeBlockAsm64K
|
|
LEAL -16(DI), DI
|
|
LEAL 16(R11), R11
|
|
|
|
matchlen_loop_16_entry_match_nolit2_encodeBlockAsm64K:
|
|
CMPL DI, $0x10
|
|
JAE matchlen_loopback_16_match_nolit2_encodeBlockAsm64K
|
|
JMP matchlen_match8_match_nolit2_encodeBlockAsm64K
|
|
|
|
matchlen_bsf_16match_nolit2_encodeBlockAsm64K:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R10, R10
|
|
|
|
#else
|
|
BSFQ R10, R10
|
|
|
|
#endif
|
|
SARQ $0x03, R10
|
|
LEAL 8(R11)(R10*1), R11
|
|
JMP match_nolit2_end_encodeBlockAsm64K
|
|
|
|
matchlen_match8_match_nolit2_encodeBlockAsm64K:
|
|
CMPL DI, $0x08
|
|
JB matchlen_match4_match_nolit2_encodeBlockAsm64K
|
|
MOVQ (R8)(R11*1), R9
|
|
XORQ (SI)(R11*1), R9
|
|
JNZ matchlen_bsf_8_match_nolit2_encodeBlockAsm64K
|
|
LEAL -8(DI), DI
|
|
LEAL 8(R11), R11
|
|
JMP matchlen_match4_match_nolit2_encodeBlockAsm64K
|
|
|
|
matchlen_bsf_8_match_nolit2_encodeBlockAsm64K:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R9, R9
|
|
|
|
#else
|
|
BSFQ R9, R9
|
|
|
|
#endif
|
|
SARQ $0x03, R9
|
|
LEAL (R11)(R9*1), R11
|
|
JMP match_nolit2_end_encodeBlockAsm64K
|
|
|
|
matchlen_match4_match_nolit2_encodeBlockAsm64K:
|
|
CMPL DI, $0x04
|
|
JB matchlen_match2_match_nolit2_encodeBlockAsm64K
|
|
MOVL (R8)(R11*1), R9
|
|
CMPL (SI)(R11*1), R9
|
|
JNE matchlen_match2_match_nolit2_encodeBlockAsm64K
|
|
LEAL -4(DI), DI
|
|
LEAL 4(R11), R11
|
|
|
|
matchlen_match2_match_nolit2_encodeBlockAsm64K:
|
|
CMPL DI, $0x01
|
|
JE matchlen_match1_match_nolit2_encodeBlockAsm64K
|
|
JB match_nolit2_end_encodeBlockAsm64K
|
|
MOVW (R8)(R11*1), R9
|
|
CMPW (SI)(R11*1), R9
|
|
JNE matchlen_match1_match_nolit2_encodeBlockAsm64K
|
|
LEAL 2(R11), R11
|
|
SUBL $0x02, DI
|
|
JZ match_nolit2_end_encodeBlockAsm64K
|
|
|
|
matchlen_match1_match_nolit2_encodeBlockAsm64K:
|
|
MOVB (R8)(R11*1), R9
|
|
CMPB (SI)(R11*1), R9
|
|
JNE match_nolit2_end_encodeBlockAsm64K
|
|
LEAL 1(R11), R11
|
|
|
|
match_nolit2_end_encodeBlockAsm64K:
|
|
ADDL R11, DX
|
|
ADDL $0x04, R11
|
|
MOVL DX, 12(SP)
|
|
MOVL 16(SP), SI
|
|
JMP match_nolits_copy_encodeBlockAsm64K
|
|
|
|
emit_remainder_encodeBlockAsm64K:
|
|
MOVQ src_len+32(FP), AX
|
|
MOVL 12(SP), DX
|
|
SUBL DX, AX
|
|
JZ emit_remainder_end_encodeBlockAsm64K
|
|
LEAQ (BX)(DX*1), DX
|
|
LEAQ 4(CX)(AX*1), BX
|
|
CMPQ BX, (SP)
|
|
JB dst_size_check_ok_5
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
dst_size_check_ok_5:
|
|
// emitLiteral
|
|
LEAL -1(AX), BX
|
|
CMPL BX, $0x1d
|
|
JB one_byte_emit_remainder_encodeBlockAsm64K
|
|
SUBL $0x1d, BX
|
|
CMPL BX, $0x00000100
|
|
JB two_bytes_emit_remainder_encodeBlockAsm64K
|
|
JB three_bytes_emit_remainder_encodeBlockAsm64K
|
|
MOVL BX, SI
|
|
SHRL $0x10, SI
|
|
MOVB $0xf8, (CX)
|
|
MOVW BX, 1(CX)
|
|
MOVB SI, 3(CX)
|
|
ADDQ $0x04, CX
|
|
ADDL $0x1d, BX
|
|
JMP memmove_long_emit_remainder_encodeBlockAsm64K
|
|
|
|
three_bytes_emit_remainder_encodeBlockAsm64K:
|
|
MOVB $0xf0, (CX)
|
|
MOVW BX, 1(CX)
|
|
ADDQ $0x03, CX
|
|
ADDL $0x1d, BX
|
|
JMP memmove_long_emit_remainder_encodeBlockAsm64K
|
|
|
|
two_bytes_emit_remainder_encodeBlockAsm64K:
|
|
MOVB $0xe8, (CX)
|
|
MOVB BL, 1(CX)
|
|
ADDL $0x1d, BX
|
|
ADDQ $0x02, CX
|
|
CMPL BX, $0x40
|
|
JB memmove_midemit_remainder_encodeBlockAsm64K
|
|
JMP memmove_long_emit_remainder_encodeBlockAsm64K
|
|
|
|
one_byte_emit_remainder_encodeBlockAsm64K:
|
|
SHLB $0x03, BL
|
|
MOVB BL, (CX)
|
|
ADDQ $0x01, CX
|
|
LEAQ (CX)(AX*1), BX
|
|
|
|
// genMemMoveShort
|
|
// margin: 0, min move: 1
|
|
CMPQ AX, $0x03
|
|
JB emit_lit_memmove_emit_remainder_encodeBlockAsm64K_memmove_move_1or2
|
|
JE emit_lit_memmove_emit_remainder_encodeBlockAsm64K_memmove_move_3
|
|
CMPQ AX, $0x08
|
|
JBE emit_lit_memmove_emit_remainder_encodeBlockAsm64K_memmove_move_4through8
|
|
CMPQ AX, $0x10
|
|
JBE emit_lit_memmove_emit_remainder_encodeBlockAsm64K_memmove_move_8through16
|
|
CMPQ AX, $0x20
|
|
JBE emit_lit_memmove_emit_remainder_encodeBlockAsm64K_memmove_move_17through32
|
|
JMP emit_lit_memmove_emit_remainder_encodeBlockAsm64K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBlockAsm64K_memmove_move_1or2:
|
|
MOVB (DX), SI
|
|
MOVB -1(DX)(AX*1), DL
|
|
MOVB SI, (CX)
|
|
MOVB DL, -1(CX)(AX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBlockAsm64K
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBlockAsm64K_memmove_move_3:
|
|
MOVW (DX), SI
|
|
MOVB 2(DX), DL
|
|
MOVW SI, (CX)
|
|
MOVB DL, 2(CX)
|
|
JMP memmove_end_copy_emit_remainder_encodeBlockAsm64K
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBlockAsm64K_memmove_move_4through8:
|
|
MOVL (DX), SI
|
|
MOVL -4(DX)(AX*1), DX
|
|
MOVL SI, (CX)
|
|
MOVL DX, -4(CX)(AX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBlockAsm64K
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBlockAsm64K_memmove_move_8through16:
|
|
MOVQ (DX), SI
|
|
MOVQ -8(DX)(AX*1), DX
|
|
MOVQ SI, (CX)
|
|
MOVQ DX, -8(CX)(AX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBlockAsm64K
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBlockAsm64K_memmove_move_17through32:
|
|
MOVOU (DX), X0
|
|
MOVOU -16(DX)(AX*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(AX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBlockAsm64K
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBlockAsm64K_memmove_move_33through64:
|
|
MOVOU (DX), X0
|
|
MOVOU 16(DX), X1
|
|
MOVOU -32(DX)(AX*1), X2
|
|
MOVOU -16(DX)(AX*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(AX*1)
|
|
MOVOU X3, -16(CX)(AX*1)
|
|
|
|
memmove_end_copy_emit_remainder_encodeBlockAsm64K:
|
|
MOVQ BX, CX
|
|
JMP emit_remainder_end_encodeBlockAsm64K
|
|
|
|
memmove_midemit_remainder_encodeBlockAsm64K:
|
|
LEAQ (CX)(AX*1), BX
|
|
|
|
// genMemMoveShort
|
|
// margin: 0, min move: 30
|
|
CMPQ AX, $0x20
|
|
JBE emit_lit_memmove_mid_emit_remainder_encodeBlockAsm64K_memmove_move_17through32
|
|
JMP emit_lit_memmove_mid_emit_remainder_encodeBlockAsm64K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_mid_emit_remainder_encodeBlockAsm64K_memmove_move_17through32:
|
|
MOVOU (DX), X0
|
|
MOVOU -16(DX)(AX*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(AX*1)
|
|
JMP memmove_mid_end_copy_emit_remainder_encodeBlockAsm64K
|
|
|
|
emit_lit_memmove_mid_emit_remainder_encodeBlockAsm64K_memmove_move_33through64:
|
|
MOVOU (DX), X0
|
|
MOVOU 16(DX), X1
|
|
MOVOU -32(DX)(AX*1), X2
|
|
MOVOU -16(DX)(AX*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(AX*1)
|
|
MOVOU X3, -16(CX)(AX*1)
|
|
|
|
memmove_mid_end_copy_emit_remainder_encodeBlockAsm64K:
|
|
MOVQ BX, CX
|
|
JMP emit_remainder_end_encodeBlockAsm64K
|
|
|
|
memmove_long_emit_remainder_encodeBlockAsm64K:
|
|
LEAQ (CX)(AX*1), BX
|
|
|
|
// genMemMoveLong
|
|
MOVOU (DX), X0
|
|
MOVOU 16(DX), X1
|
|
MOVOU -32(DX)(AX*1), X2
|
|
MOVOU -16(DX)(AX*1), X3
|
|
MOVQ AX, DI
|
|
SHRQ $0x05, DI
|
|
MOVQ CX, SI
|
|
ANDL $0x0000001f, SI
|
|
MOVQ $0x00000040, R8
|
|
SUBQ SI, R8
|
|
DECQ DI
|
|
JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm64Klarge_forward_sse_loop_32
|
|
LEAQ -32(DX)(R8*1), SI
|
|
LEAQ -32(CX)(R8*1), R9
|
|
|
|
emit_lit_memmove_long_emit_remainder_encodeBlockAsm64Klarge_big_loop_back:
|
|
MOVOU (SI), X4
|
|
MOVOU 16(SI), X5
|
|
MOVOA X4, (R9)
|
|
MOVOA X5, 16(R9)
|
|
ADDQ $0x20, R9
|
|
ADDQ $0x20, SI
|
|
ADDQ $0x20, R8
|
|
DECQ DI
|
|
JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm64Klarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_emit_remainder_encodeBlockAsm64Klarge_forward_sse_loop_32:
|
|
MOVOU -32(DX)(R8*1), X4
|
|
MOVOU -16(DX)(R8*1), X5
|
|
MOVOA X4, -32(CX)(R8*1)
|
|
MOVOA X5, -16(CX)(R8*1)
|
|
ADDQ $0x20, R8
|
|
CMPQ AX, R8
|
|
JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm64Klarge_forward_sse_loop_32
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(AX*1)
|
|
MOVOU X3, -16(CX)(AX*1)
|
|
MOVQ BX, CX
|
|
|
|
emit_remainder_end_encodeBlockAsm64K:
|
|
MOVQ dst_base+0(FP), AX
|
|
SUBQ AX, CX
|
|
MOVQ CX, ret+56(FP)
|
|
RET
|
|
|
|
// func encodeBlockAsm16K(dst []byte, src []byte, tmp *[8192]byte) int
|
|
// Requires: BMI, CMOV, SSE2
|
|
TEXT ·encodeBlockAsm16K(SB), $24-64
|
|
MOVQ tmp+48(FP), AX
|
|
MOVQ dst_base+0(FP), CX
|
|
MOVQ $0x00000040, DX
|
|
MOVQ AX, BX
|
|
PXOR X0, X0
|
|
|
|
zero_loop_encodeBlockAsm16K:
|
|
MOVOU X0, (BX)
|
|
MOVOU X0, 16(BX)
|
|
MOVOU X0, 32(BX)
|
|
MOVOU X0, 48(BX)
|
|
MOVOU X0, 64(BX)
|
|
MOVOU X0, 80(BX)
|
|
MOVOU X0, 96(BX)
|
|
MOVOU X0, 112(BX)
|
|
ADDQ $0x80, BX
|
|
DECQ DX
|
|
JNZ zero_loop_encodeBlockAsm16K
|
|
MOVL $0x00000000, 12(SP)
|
|
MOVQ src_len+32(FP), DX
|
|
LEAQ -17(DX), BX
|
|
LEAQ -17(DX), SI
|
|
MOVL SI, 8(SP)
|
|
SHRQ $0x05, DX
|
|
SUBL DX, BX
|
|
LEAQ (CX)(BX*1), BX
|
|
MOVQ BX, (SP)
|
|
MOVL $0x00000001, DX
|
|
MOVL DX, 16(SP)
|
|
MOVQ src_base+24(FP), BX
|
|
|
|
search_loop_encodeBlockAsm16K:
|
|
MOVL DX, SI
|
|
SUBL 12(SP), SI
|
|
SHRL $0x05, SI
|
|
LEAL 4(DX)(SI*1), SI
|
|
CMPL SI, 8(SP)
|
|
JAE emit_remainder_encodeBlockAsm16K
|
|
MOVQ (BX)(DX*1), DI
|
|
MOVL SI, 20(SP)
|
|
MOVQ $0x000000cf1bbcdcbb, R9
|
|
MOVQ DI, R10
|
|
MOVQ DI, R11
|
|
SHRQ $0x08, R11
|
|
SHLQ $0x18, R10
|
|
IMULQ R9, R10
|
|
SHRQ $0x34, R10
|
|
SHLQ $0x18, R11
|
|
IMULQ R9, R11
|
|
SHRQ $0x34, R11
|
|
MOVWLZX (AX)(R10*2), SI
|
|
MOVWLZX (AX)(R11*2), R8
|
|
MOVW DX, (AX)(R10*2)
|
|
MOVW DX, (AX)(R11*2)
|
|
MOVQ DI, R10
|
|
SHRQ $0x10, R10
|
|
SHLQ $0x18, R10
|
|
IMULQ R9, R10
|
|
SHRQ $0x34, R10
|
|
MOVL DX, R9
|
|
SUBL 16(SP), R9
|
|
MOVL 1(BX)(R9*1), R11
|
|
MOVQ DI, R9
|
|
SHRQ $0x08, R9
|
|
CMPL R9, R11
|
|
JNE no_repeat_found_encodeBlockAsm16K
|
|
LEAL 1(DX), DI
|
|
MOVL 12(SP), SI
|
|
MOVL DI, R8
|
|
SUBL 16(SP), R8
|
|
JZ repeat_extend_back_end_encodeBlockAsm16K
|
|
|
|
repeat_extend_back_loop_encodeBlockAsm16K:
|
|
CMPL DI, SI
|
|
JBE repeat_extend_back_end_encodeBlockAsm16K
|
|
MOVB -1(BX)(R8*1), R9
|
|
MOVB -1(BX)(DI*1), R10
|
|
CMPB R9, R10
|
|
JNE repeat_extend_back_end_encodeBlockAsm16K
|
|
LEAL -1(DI), DI
|
|
DECL R8
|
|
JNZ repeat_extend_back_loop_encodeBlockAsm16K
|
|
|
|
repeat_extend_back_end_encodeBlockAsm16K:
|
|
MOVL DI, SI
|
|
MOVL 12(SP), R8
|
|
SUBL R8, SI
|
|
LEAQ 3(CX)(SI*1), R9
|
|
CMPQ R9, (SP)
|
|
JB dst_size_check_ok_1
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
dst_size_check_ok_1:
|
|
LEAQ (BX)(R8*1), R8
|
|
|
|
// emitLiteral
|
|
LEAL -1(SI), R9
|
|
CMPL R9, $0x1d
|
|
JB one_byte_repeat_emit_lits_encodeBlockAsm16K
|
|
SUBL $0x1d, R9
|
|
CMPL R9, $0x00000100
|
|
JB two_bytes_repeat_emit_lits_encodeBlockAsm16K
|
|
JB three_bytes_repeat_emit_lits_encodeBlockAsm16K
|
|
|
|
three_bytes_repeat_emit_lits_encodeBlockAsm16K:
|
|
MOVB $0xf0, (CX)
|
|
MOVW R9, 1(CX)
|
|
ADDQ $0x03, CX
|
|
ADDL $0x1d, R9
|
|
JMP memmove_long_repeat_emit_lits_encodeBlockAsm16K
|
|
|
|
two_bytes_repeat_emit_lits_encodeBlockAsm16K:
|
|
MOVB $0xe8, (CX)
|
|
MOVB R9, 1(CX)
|
|
ADDL $0x1d, R9
|
|
ADDQ $0x02, CX
|
|
CMPL R9, $0x40
|
|
JB memmove_midrepeat_emit_lits_encodeBlockAsm16K
|
|
JMP memmove_long_repeat_emit_lits_encodeBlockAsm16K
|
|
|
|
one_byte_repeat_emit_lits_encodeBlockAsm16K:
|
|
SHLB $0x03, R9
|
|
MOVB R9, (CX)
|
|
ADDQ $0x01, CX
|
|
LEAQ (CX)(SI*1), R9
|
|
|
|
// genMemMoveShort
|
|
// margin: 16, min move: 1
|
|
CMPQ SI, $0x10
|
|
JBE emit_lit_memmove_repeat_emit_lits_encodeBlockAsm16K_memmove_move_8through16
|
|
CMPQ SI, $0x20
|
|
JBE emit_lit_memmove_repeat_emit_lits_encodeBlockAsm16K_memmove_move_17through32
|
|
JMP emit_lit_memmove_repeat_emit_lits_encodeBlockAsm16K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_repeat_emit_lits_encodeBlockAsm16K_memmove_move_8through16:
|
|
MOVOU (R8), X0
|
|
MOVOU X0, (CX)
|
|
JMP memmove_end_copy_repeat_emit_lits_encodeBlockAsm16K
|
|
|
|
emit_lit_memmove_repeat_emit_lits_encodeBlockAsm16K_memmove_move_17through32:
|
|
MOVOU (R8), X0
|
|
MOVOU -16(R8)(SI*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(SI*1)
|
|
JMP memmove_end_copy_repeat_emit_lits_encodeBlockAsm16K
|
|
|
|
emit_lit_memmove_repeat_emit_lits_encodeBlockAsm16K_memmove_move_33through64:
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(SI*1), X2
|
|
MOVOU -16(R8)(SI*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(SI*1)
|
|
MOVOU X3, -16(CX)(SI*1)
|
|
|
|
memmove_end_copy_repeat_emit_lits_encodeBlockAsm16K:
|
|
MOVQ R9, CX
|
|
JMP repeat_emit_lits_end_encodeBlockAsm16K
|
|
|
|
memmove_midrepeat_emit_lits_encodeBlockAsm16K:
|
|
LEAQ (CX)(SI*1), R9
|
|
|
|
// genMemMoveShort
|
|
// margin: 15, min move: 30
|
|
CMPQ SI, $0x20
|
|
JBE emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm16K_memmove_move_17through32
|
|
JMP emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm16K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm16K_memmove_move_17through32:
|
|
MOVOU (R8), X0
|
|
MOVOU -16(R8)(SI*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(SI*1)
|
|
JMP memmove_mid_end_copy_repeat_emit_lits_encodeBlockAsm16K
|
|
|
|
emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm16K_memmove_move_33through64:
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(SI*1), X2
|
|
MOVOU -16(R8)(SI*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(SI*1)
|
|
MOVOU X3, -16(CX)(SI*1)
|
|
|
|
memmove_mid_end_copy_repeat_emit_lits_encodeBlockAsm16K:
|
|
MOVQ R9, CX
|
|
JMP repeat_emit_lits_end_encodeBlockAsm16K
|
|
|
|
memmove_long_repeat_emit_lits_encodeBlockAsm16K:
|
|
LEAQ (CX)(SI*1), R9
|
|
|
|
// genMemMoveLong
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(SI*1), X2
|
|
MOVOU -16(R8)(SI*1), X3
|
|
MOVQ SI, R11
|
|
SHRQ $0x05, R11
|
|
MOVQ CX, R10
|
|
ANDL $0x0000001f, R10
|
|
MOVQ $0x00000040, R12
|
|
SUBQ R10, R12
|
|
DECQ R11
|
|
JA emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm16Klarge_forward_sse_loop_32
|
|
LEAQ -32(R8)(R12*1), R10
|
|
LEAQ -32(CX)(R12*1), R13
|
|
|
|
emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm16Klarge_big_loop_back:
|
|
MOVOU (R10), X4
|
|
MOVOU 16(R10), X5
|
|
MOVOA X4, (R13)
|
|
MOVOA X5, 16(R13)
|
|
ADDQ $0x20, R13
|
|
ADDQ $0x20, R10
|
|
ADDQ $0x20, R12
|
|
DECQ R11
|
|
JNA emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm16Klarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm16Klarge_forward_sse_loop_32:
|
|
MOVOU -32(R8)(R12*1), X4
|
|
MOVOU -16(R8)(R12*1), X5
|
|
MOVOA X4, -32(CX)(R12*1)
|
|
MOVOA X5, -16(CX)(R12*1)
|
|
ADDQ $0x20, R12
|
|
CMPQ SI, R12
|
|
JAE emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm16Klarge_forward_sse_loop_32
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(SI*1)
|
|
MOVOU X3, -16(CX)(SI*1)
|
|
MOVQ R9, CX
|
|
|
|
repeat_emit_lits_end_encodeBlockAsm16K:
|
|
ADDL $0x05, DX
|
|
MOVL DX, SI
|
|
SUBL 16(SP), SI
|
|
MOVQ src_len+32(FP), R8
|
|
SUBL DX, R8
|
|
LEAQ (BX)(DX*1), R9
|
|
LEAQ (BX)(SI*1), SI
|
|
|
|
// matchLen
|
|
XORL R11, R11
|
|
JMP matchlen_loop_16_entry_repeat_extend_encodeBlockAsm16K
|
|
|
|
matchlen_loopback_16_repeat_extend_encodeBlockAsm16K:
|
|
MOVQ (R9)(R11*1), R10
|
|
MOVQ 8(R9)(R11*1), R12
|
|
XORQ (SI)(R11*1), R10
|
|
JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm16K
|
|
XORQ 8(SI)(R11*1), R12
|
|
JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm16K
|
|
LEAL -16(R8), R8
|
|
LEAL 16(R11), R11
|
|
|
|
matchlen_loop_16_entry_repeat_extend_encodeBlockAsm16K:
|
|
CMPL R8, $0x10
|
|
JAE matchlen_loopback_16_repeat_extend_encodeBlockAsm16K
|
|
JMP matchlen_match8_repeat_extend_encodeBlockAsm16K
|
|
|
|
matchlen_bsf_16repeat_extend_encodeBlockAsm16K:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R12, R12
|
|
|
|
#else
|
|
BSFQ R12, R12
|
|
|
|
#endif
|
|
SARQ $0x03, R12
|
|
LEAL 8(R11)(R12*1), R11
|
|
JMP repeat_extend_forward_end_encodeBlockAsm16K
|
|
|
|
matchlen_match8_repeat_extend_encodeBlockAsm16K:
|
|
CMPL R8, $0x08
|
|
JB matchlen_match4_repeat_extend_encodeBlockAsm16K
|
|
MOVQ (R9)(R11*1), R10
|
|
XORQ (SI)(R11*1), R10
|
|
JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm16K
|
|
LEAL -8(R8), R8
|
|
LEAL 8(R11), R11
|
|
JMP matchlen_match4_repeat_extend_encodeBlockAsm16K
|
|
|
|
matchlen_bsf_8_repeat_extend_encodeBlockAsm16K:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R10, R10
|
|
|
|
#else
|
|
BSFQ R10, R10
|
|
|
|
#endif
|
|
SARQ $0x03, R10
|
|
LEAL (R11)(R10*1), R11
|
|
JMP repeat_extend_forward_end_encodeBlockAsm16K
|
|
|
|
matchlen_match4_repeat_extend_encodeBlockAsm16K:
|
|
CMPL R8, $0x04
|
|
JB matchlen_match2_repeat_extend_encodeBlockAsm16K
|
|
MOVL (R9)(R11*1), R10
|
|
CMPL (SI)(R11*1), R10
|
|
JNE matchlen_match2_repeat_extend_encodeBlockAsm16K
|
|
LEAL -4(R8), R8
|
|
LEAL 4(R11), R11
|
|
|
|
matchlen_match2_repeat_extend_encodeBlockAsm16K:
|
|
CMPL R8, $0x01
|
|
JE matchlen_match1_repeat_extend_encodeBlockAsm16K
|
|
JB repeat_extend_forward_end_encodeBlockAsm16K
|
|
MOVW (R9)(R11*1), R10
|
|
CMPW (SI)(R11*1), R10
|
|
JNE matchlen_match1_repeat_extend_encodeBlockAsm16K
|
|
LEAL 2(R11), R11
|
|
SUBL $0x02, R8
|
|
JZ repeat_extend_forward_end_encodeBlockAsm16K
|
|
|
|
matchlen_match1_repeat_extend_encodeBlockAsm16K:
|
|
MOVB (R9)(R11*1), R10
|
|
CMPB (SI)(R11*1), R10
|
|
JNE repeat_extend_forward_end_encodeBlockAsm16K
|
|
LEAL 1(R11), R11
|
|
|
|
repeat_extend_forward_end_encodeBlockAsm16K:
|
|
ADDL R11, DX
|
|
MOVL DX, SI
|
|
SUBL DI, SI
|
|
MOVL 16(SP), DI
|
|
|
|
// emitRepeat
|
|
LEAL -1(SI), DI
|
|
CMPL SI, $0x1d
|
|
JBE repeat_one_match_repeat_encodeBlockAsm16K
|
|
LEAL -30(SI), DI
|
|
CMPL SI, $0x0000011e
|
|
JB repeat_two_match_repeat_encodeBlockAsm16K
|
|
CMPL SI, $0x0001001e
|
|
JB repeat_three_match_repeat_encodeBlockAsm16K
|
|
MOVB $0xfc, (CX)
|
|
MOVL DI, 1(CX)
|
|
ADDQ $0x04, CX
|
|
JMP repeat_end_emit_encodeBlockAsm16K
|
|
|
|
repeat_three_match_repeat_encodeBlockAsm16K:
|
|
MOVB $0xf4, (CX)
|
|
MOVW DI, 1(CX)
|
|
ADDQ $0x03, CX
|
|
JMP repeat_end_emit_encodeBlockAsm16K
|
|
|
|
repeat_two_match_repeat_encodeBlockAsm16K:
|
|
MOVB $0xec, (CX)
|
|
MOVB DI, 1(CX)
|
|
ADDQ $0x02, CX
|
|
JMP repeat_end_emit_encodeBlockAsm16K
|
|
|
|
repeat_one_match_repeat_encodeBlockAsm16K:
|
|
XORL DI, DI
|
|
LEAL -4(DI)(SI*8), DI
|
|
MOVB DI, (CX)
|
|
ADDQ $0x01, CX
|
|
|
|
repeat_end_emit_encodeBlockAsm16K:
|
|
MOVL DX, 12(SP)
|
|
JMP search_loop_encodeBlockAsm16K
|
|
|
|
no_repeat_found_encodeBlockAsm16K:
|
|
CMPL (BX)(SI*1), DI
|
|
JEQ candidate_match_encodeBlockAsm16K
|
|
SHRQ $0x08, DI
|
|
MOVWLZX (AX)(R10*2), SI
|
|
LEAL 2(DX), R9
|
|
CMPL (BX)(R8*1), DI
|
|
JEQ candidate2_match_encodeBlockAsm16K
|
|
MOVW R9, (AX)(R10*2)
|
|
SHRQ $0x08, DI
|
|
CMPL (BX)(SI*1), DI
|
|
JEQ candidate3_match_encodeBlockAsm16K
|
|
MOVL 20(SP), DX
|
|
JMP search_loop_encodeBlockAsm16K
|
|
|
|
candidate3_match_encodeBlockAsm16K:
|
|
ADDL $0x02, DX
|
|
JMP candidate_match_encodeBlockAsm16K
|
|
|
|
candidate2_match_encodeBlockAsm16K:
|
|
MOVW R9, (AX)(R10*2)
|
|
INCL DX
|
|
MOVL R8, SI
|
|
|
|
candidate_match_encodeBlockAsm16K:
|
|
MOVL 12(SP), DI
|
|
TESTL SI, SI
|
|
JZ match_extend_back_end_encodeBlockAsm16K
|
|
|
|
match_extend_back_loop_encodeBlockAsm16K:
|
|
CMPL DX, DI
|
|
JBE match_extend_back_end_encodeBlockAsm16K
|
|
MOVB -1(BX)(SI*1), R8
|
|
MOVB -1(BX)(DX*1), R9
|
|
CMPB R8, R9
|
|
JNE match_extend_back_end_encodeBlockAsm16K
|
|
LEAL -1(DX), DX
|
|
DECL SI
|
|
JZ match_extend_back_end_encodeBlockAsm16K
|
|
JMP match_extend_back_loop_encodeBlockAsm16K
|
|
|
|
match_extend_back_end_encodeBlockAsm16K:
|
|
CMPQ CX, (SP)
|
|
JB dst_size_check_ok_2
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
dst_size_check_ok_2:
|
|
MOVL DX, R8
|
|
MOVL DX, DI
|
|
SUBL SI, DI
|
|
MOVL DI, 16(SP)
|
|
ADDL $0x04, DX
|
|
ADDL $0x04, SI
|
|
MOVQ src_len+32(FP), DI
|
|
SUBL DX, DI
|
|
LEAQ (BX)(DX*1), R9
|
|
LEAQ (BX)(SI*1), SI
|
|
|
|
// matchLen
|
|
XORL R11, R11
|
|
JMP matchlen_loop_16_entry_match_nolit_encodeBlockAsm16K
|
|
|
|
matchlen_loopback_16_match_nolit_encodeBlockAsm16K:
|
|
MOVQ (R9)(R11*1), R10
|
|
MOVQ 8(R9)(R11*1), R12
|
|
XORQ (SI)(R11*1), R10
|
|
JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm16K
|
|
XORQ 8(SI)(R11*1), R12
|
|
JNZ matchlen_bsf_16match_nolit_encodeBlockAsm16K
|
|
LEAL -16(DI), DI
|
|
LEAL 16(R11), R11
|
|
|
|
matchlen_loop_16_entry_match_nolit_encodeBlockAsm16K:
|
|
CMPL DI, $0x10
|
|
JAE matchlen_loopback_16_match_nolit_encodeBlockAsm16K
|
|
JMP matchlen_match8_match_nolit_encodeBlockAsm16K
|
|
|
|
matchlen_bsf_16match_nolit_encodeBlockAsm16K:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R12, R12
|
|
|
|
#else
|
|
BSFQ R12, R12
|
|
|
|
#endif
|
|
SARQ $0x03, R12
|
|
LEAL 8(R11)(R12*1), R11
|
|
JMP match_nolit_end_encodeBlockAsm16K
|
|
|
|
matchlen_match8_match_nolit_encodeBlockAsm16K:
|
|
CMPL DI, $0x08
|
|
JB matchlen_match4_match_nolit_encodeBlockAsm16K
|
|
MOVQ (R9)(R11*1), R10
|
|
XORQ (SI)(R11*1), R10
|
|
JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm16K
|
|
LEAL -8(DI), DI
|
|
LEAL 8(R11), R11
|
|
JMP matchlen_match4_match_nolit_encodeBlockAsm16K
|
|
|
|
matchlen_bsf_8_match_nolit_encodeBlockAsm16K:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R10, R10
|
|
|
|
#else
|
|
BSFQ R10, R10
|
|
|
|
#endif
|
|
SARQ $0x03, R10
|
|
LEAL (R11)(R10*1), R11
|
|
JMP match_nolit_end_encodeBlockAsm16K
|
|
|
|
matchlen_match4_match_nolit_encodeBlockAsm16K:
|
|
CMPL DI, $0x04
|
|
JB matchlen_match2_match_nolit_encodeBlockAsm16K
|
|
MOVL (R9)(R11*1), R10
|
|
CMPL (SI)(R11*1), R10
|
|
JNE matchlen_match2_match_nolit_encodeBlockAsm16K
|
|
LEAL -4(DI), DI
|
|
LEAL 4(R11), R11
|
|
|
|
matchlen_match2_match_nolit_encodeBlockAsm16K:
|
|
CMPL DI, $0x01
|
|
JE matchlen_match1_match_nolit_encodeBlockAsm16K
|
|
JB match_nolit_end_encodeBlockAsm16K
|
|
MOVW (R9)(R11*1), R10
|
|
CMPW (SI)(R11*1), R10
|
|
JNE matchlen_match1_match_nolit_encodeBlockAsm16K
|
|
LEAL 2(R11), R11
|
|
SUBL $0x02, DI
|
|
JZ match_nolit_end_encodeBlockAsm16K
|
|
|
|
matchlen_match1_match_nolit_encodeBlockAsm16K:
|
|
MOVB (R9)(R11*1), R10
|
|
CMPB (SI)(R11*1), R10
|
|
JNE match_nolit_end_encodeBlockAsm16K
|
|
LEAL 1(R11), R11
|
|
|
|
match_nolit_end_encodeBlockAsm16K:
|
|
ADDL R11, DX
|
|
ADDL $0x04, R11
|
|
MOVL 16(SP), SI
|
|
MOVL 12(SP), DI
|
|
MOVL DX, 12(SP)
|
|
SUBL DI, R8
|
|
JZ match_nolits_copy_encodeBlockAsm16K
|
|
LEAQ (BX)(DI*1), DI
|
|
CMPL R8, $0x03
|
|
JA match_emit_lits_copy_encodeBlockAsm16K
|
|
CMPL SI, $0x40
|
|
JB match_emit_lits_copy_encodeBlockAsm16K
|
|
MOVL (DI), DI
|
|
|
|
// emitCopy2WithLits
|
|
XORQ R9, R9
|
|
SUBL $0x40, SI
|
|
LEAL -11(R11), R10
|
|
LEAL -4(R11), R11
|
|
MOVW SI, 1(CX)
|
|
CMPL R11, $0x07
|
|
CMOVLGE R10, R9
|
|
MOVQ $0x00000007, SI
|
|
CMOVLLT R11, SI
|
|
LEAL -1(R8)(SI*4), SI
|
|
MOVL $0x00000003, R10
|
|
LEAL (R10)(SI*8), SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x03, CX
|
|
MOVL DI, (CX)
|
|
ADDQ R8, CX
|
|
TESTL R9, R9
|
|
JZ match_nolit_emitcopy_end_encodeBlockAsm16K
|
|
|
|
// emitRepeat
|
|
LEAL -1(R9), SI
|
|
CMPL R9, $0x1d
|
|
JBE repeat_one_match_emit_repeat_copy2_encodeBlockAsm16K
|
|
LEAL -30(R9), SI
|
|
CMPL R9, $0x0000011e
|
|
JB repeat_two_match_emit_repeat_copy2_encodeBlockAsm16K
|
|
CMPL R9, $0x0001001e
|
|
JB repeat_three_match_emit_repeat_copy2_encodeBlockAsm16K
|
|
MOVB $0xfc, (CX)
|
|
MOVL SI, 1(CX)
|
|
ADDQ $0x04, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm16K
|
|
|
|
repeat_three_match_emit_repeat_copy2_encodeBlockAsm16K:
|
|
MOVB $0xf4, (CX)
|
|
MOVW SI, 1(CX)
|
|
ADDQ $0x03, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm16K
|
|
|
|
repeat_two_match_emit_repeat_copy2_encodeBlockAsm16K:
|
|
MOVB $0xec, (CX)
|
|
MOVB SI, 1(CX)
|
|
ADDQ $0x02, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm16K
|
|
|
|
repeat_one_match_emit_repeat_copy2_encodeBlockAsm16K:
|
|
XORL SI, SI
|
|
LEAL -4(SI)(R9*8), SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x01, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm16K
|
|
|
|
match_emit_lits_copy_encodeBlockAsm16K:
|
|
LEAQ 3(CX)(R8*1), R9
|
|
CMPQ R9, (SP)
|
|
JB dst_size_check_ok_3
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
dst_size_check_ok_3:
|
|
// emitLiteral
|
|
LEAL -1(R8), R9
|
|
CMPL R9, $0x1d
|
|
JB one_byte_match_emit_encodeBlockAsm16K
|
|
SUBL $0x1d, R9
|
|
CMPL R9, $0x00000100
|
|
JB two_bytes_match_emit_encodeBlockAsm16K
|
|
JB three_bytes_match_emit_encodeBlockAsm16K
|
|
|
|
three_bytes_match_emit_encodeBlockAsm16K:
|
|
MOVB $0xf0, (CX)
|
|
MOVW R9, 1(CX)
|
|
ADDQ $0x03, CX
|
|
ADDL $0x1d, R9
|
|
JMP memmove_long_match_emit_encodeBlockAsm16K
|
|
|
|
two_bytes_match_emit_encodeBlockAsm16K:
|
|
MOVB $0xe8, (CX)
|
|
MOVB R9, 1(CX)
|
|
ADDL $0x1d, R9
|
|
ADDQ $0x02, CX
|
|
CMPL R9, $0x40
|
|
JB memmove_midmatch_emit_encodeBlockAsm16K
|
|
JMP memmove_long_match_emit_encodeBlockAsm16K
|
|
|
|
one_byte_match_emit_encodeBlockAsm16K:
|
|
SHLB $0x03, R9
|
|
MOVB R9, (CX)
|
|
ADDQ $0x01, CX
|
|
LEAQ (CX)(R8*1), R9
|
|
|
|
// genMemMoveShort
|
|
// margin: 16, min move: 1
|
|
CMPQ R8, $0x10
|
|
JBE emit_lit_memmove_match_emit_encodeBlockAsm16K_memmove_move_8through16
|
|
CMPQ R8, $0x20
|
|
JBE emit_lit_memmove_match_emit_encodeBlockAsm16K_memmove_move_17through32
|
|
JMP emit_lit_memmove_match_emit_encodeBlockAsm16K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_match_emit_encodeBlockAsm16K_memmove_move_8through16:
|
|
MOVOU (DI), X0
|
|
MOVOU X0, (CX)
|
|
JMP memmove_end_copy_match_emit_encodeBlockAsm16K
|
|
|
|
emit_lit_memmove_match_emit_encodeBlockAsm16K_memmove_move_17through32:
|
|
MOVOU (DI), X0
|
|
MOVOU -16(DI)(R8*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(R8*1)
|
|
JMP memmove_end_copy_match_emit_encodeBlockAsm16K
|
|
|
|
emit_lit_memmove_match_emit_encodeBlockAsm16K_memmove_move_33through64:
|
|
MOVOU (DI), X0
|
|
MOVOU 16(DI), X1
|
|
MOVOU -32(DI)(R8*1), X2
|
|
MOVOU -16(DI)(R8*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(R8*1)
|
|
MOVOU X3, -16(CX)(R8*1)
|
|
|
|
memmove_end_copy_match_emit_encodeBlockAsm16K:
|
|
MOVQ R9, CX
|
|
JMP match_nolits_copy_encodeBlockAsm16K
|
|
|
|
memmove_midmatch_emit_encodeBlockAsm16K:
|
|
LEAQ (CX)(R8*1), R9
|
|
|
|
// genMemMoveShort
|
|
// margin: 15, min move: 30
|
|
CMPQ R8, $0x20
|
|
JBE emit_lit_memmove_mid_match_emit_encodeBlockAsm16K_memmove_move_17through32
|
|
JMP emit_lit_memmove_mid_match_emit_encodeBlockAsm16K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_mid_match_emit_encodeBlockAsm16K_memmove_move_17through32:
|
|
MOVOU (DI), X0
|
|
MOVOU -16(DI)(R8*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(R8*1)
|
|
JMP memmove_mid_end_copy_match_emit_encodeBlockAsm16K
|
|
|
|
emit_lit_memmove_mid_match_emit_encodeBlockAsm16K_memmove_move_33through64:
|
|
MOVOU (DI), X0
|
|
MOVOU 16(DI), X1
|
|
MOVOU -32(DI)(R8*1), X2
|
|
MOVOU -16(DI)(R8*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(R8*1)
|
|
MOVOU X3, -16(CX)(R8*1)
|
|
|
|
memmove_mid_end_copy_match_emit_encodeBlockAsm16K:
|
|
MOVQ R9, CX
|
|
JMP match_nolits_copy_encodeBlockAsm16K
|
|
|
|
memmove_long_match_emit_encodeBlockAsm16K:
|
|
LEAQ (CX)(R8*1), R9
|
|
|
|
// genMemMoveLong
|
|
MOVOU (DI), X0
|
|
MOVOU 16(DI), X1
|
|
MOVOU -32(DI)(R8*1), X2
|
|
MOVOU -16(DI)(R8*1), X3
|
|
MOVQ R8, R12
|
|
SHRQ $0x05, R12
|
|
MOVQ CX, R10
|
|
ANDL $0x0000001f, R10
|
|
MOVQ $0x00000040, R13
|
|
SUBQ R10, R13
|
|
DECQ R12
|
|
JA emit_lit_memmove_long_match_emit_encodeBlockAsm16Klarge_forward_sse_loop_32
|
|
LEAQ -32(DI)(R13*1), R10
|
|
LEAQ -32(CX)(R13*1), R14
|
|
|
|
emit_lit_memmove_long_match_emit_encodeBlockAsm16Klarge_big_loop_back:
|
|
MOVOU (R10), X4
|
|
MOVOU 16(R10), X5
|
|
MOVOA X4, (R14)
|
|
MOVOA X5, 16(R14)
|
|
ADDQ $0x20, R14
|
|
ADDQ $0x20, R10
|
|
ADDQ $0x20, R13
|
|
DECQ R12
|
|
JNA emit_lit_memmove_long_match_emit_encodeBlockAsm16Klarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_match_emit_encodeBlockAsm16Klarge_forward_sse_loop_32:
|
|
MOVOU -32(DI)(R13*1), X4
|
|
MOVOU -16(DI)(R13*1), X5
|
|
MOVOA X4, -32(CX)(R13*1)
|
|
MOVOA X5, -16(CX)(R13*1)
|
|
ADDQ $0x20, R13
|
|
CMPQ R8, R13
|
|
JAE emit_lit_memmove_long_match_emit_encodeBlockAsm16Klarge_forward_sse_loop_32
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(R8*1)
|
|
MOVOU X3, -16(CX)(R8*1)
|
|
MOVQ R9, CX
|
|
|
|
match_nolits_copy_encodeBlockAsm16K:
|
|
// emitCopy
|
|
CMPL SI, $0x00000400
|
|
JA two_byte_match_nolit_encodeBlockAsm16K
|
|
CMPL R11, $0x00000013
|
|
JAE emit_one_longer_match_nolit_encodeBlockAsm16K
|
|
LEAL -1(SI), SI
|
|
SHLL $0x06, SI
|
|
LEAL -15(SI)(R11*4), SI
|
|
MOVW SI, (CX)
|
|
ADDQ $0x02, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm16K
|
|
|
|
emit_one_longer_match_nolit_encodeBlockAsm16K:
|
|
CMPL R11, $0x00000112
|
|
JAE emit_copy1_repeat_match_nolit_encodeBlockAsm16K
|
|
LEAL -1(SI), SI
|
|
SHLL $0x06, SI
|
|
LEAL 61(SI), SI
|
|
MOVW SI, (CX)
|
|
LEAL -18(R11), SI
|
|
MOVB SI, 2(CX)
|
|
ADDQ $0x03, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm16K
|
|
|
|
emit_copy1_repeat_match_nolit_encodeBlockAsm16K:
|
|
LEAL -1(SI), SI
|
|
SHLL $0x06, SI
|
|
LEAL 57(SI), SI
|
|
MOVW SI, (CX)
|
|
ADDQ $0x02, CX
|
|
SUBL $0x12, R11
|
|
|
|
// emitRepeat
|
|
LEAL -1(R11), SI
|
|
CMPL R11, $0x1d
|
|
JBE repeat_one_emit_copy1_do_repeat_match_nolit_encodeBlockAsm16K
|
|
LEAL -30(R11), SI
|
|
CMPL R11, $0x0000011e
|
|
JB repeat_two_emit_copy1_do_repeat_match_nolit_encodeBlockAsm16K
|
|
CMPL R11, $0x0001001e
|
|
JB repeat_three_emit_copy1_do_repeat_match_nolit_encodeBlockAsm16K
|
|
MOVB $0xfc, (CX)
|
|
MOVL SI, 1(CX)
|
|
ADDQ $0x04, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm16K
|
|
|
|
repeat_three_emit_copy1_do_repeat_match_nolit_encodeBlockAsm16K:
|
|
MOVB $0xf4, (CX)
|
|
MOVW SI, 1(CX)
|
|
ADDQ $0x03, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm16K
|
|
|
|
repeat_two_emit_copy1_do_repeat_match_nolit_encodeBlockAsm16K:
|
|
MOVB $0xec, (CX)
|
|
MOVB SI, 1(CX)
|
|
ADDQ $0x02, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm16K
|
|
|
|
repeat_one_emit_copy1_do_repeat_match_nolit_encodeBlockAsm16K:
|
|
XORL SI, SI
|
|
LEAL -4(SI)(R11*8), SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x01, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm16K
|
|
|
|
two_byte_match_nolit_encodeBlockAsm16K:
|
|
// emitCopy2
|
|
LEAL -64(SI), SI
|
|
LEAL -4(R11), R11
|
|
MOVW SI, 1(CX)
|
|
CMPL R11, $0x3c
|
|
JBE emit_copy2_0_match_nolit_encodeBlockAsm16K_emit2
|
|
LEAL -60(R11), SI
|
|
CMPL R11, $0x0000013c
|
|
JB emit_copy2_1_match_nolit_encodeBlockAsm16K_emit2
|
|
CMPL R11, $0x0001003c
|
|
JB emit_copy2_2_match_nolit_encodeBlockAsm16K_emit2
|
|
MOVB $0xfe, (CX)
|
|
MOVL SI, 3(CX)
|
|
ADDQ $0x06, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm16K
|
|
|
|
emit_copy2_2_match_nolit_encodeBlockAsm16K_emit2:
|
|
MOVB $0xfa, (CX)
|
|
MOVW SI, 3(CX)
|
|
ADDQ $0x05, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm16K
|
|
|
|
emit_copy2_1_match_nolit_encodeBlockAsm16K_emit2:
|
|
MOVB $0xf6, (CX)
|
|
MOVB SI, 3(CX)
|
|
ADDQ $0x04, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm16K
|
|
|
|
emit_copy2_0_match_nolit_encodeBlockAsm16K_emit2:
|
|
MOVL $0x00000002, SI
|
|
LEAL (SI)(R11*4), SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x03, CX
|
|
|
|
match_nolit_emitcopy_end_encodeBlockAsm16K:
|
|
CMPL DX, 8(SP)
|
|
JAE emit_remainder_encodeBlockAsm16K
|
|
MOVQ -2(BX)(DX*1), DI
|
|
CMPQ CX, (SP)
|
|
JB match_nolit_dst_ok_encodeBlockAsm16K
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
match_nolit_dst_ok_encodeBlockAsm16K:
|
|
MOVQ $0x000000cf1bbcdcbb, SI
|
|
MOVQ DI, R8
|
|
SHRQ $0x10, DI
|
|
MOVQ DI, R9
|
|
SHLQ $0x18, R8
|
|
IMULQ SI, R8
|
|
SHRQ $0x34, R8
|
|
SHLQ $0x18, R9
|
|
IMULQ SI, R9
|
|
SHRQ $0x34, R9
|
|
LEAL -2(DX), R10
|
|
MOVWLZX (AX)(R9*2), SI
|
|
MOVW R10, (AX)(R8*2)
|
|
MOVW DX, (AX)(R9*2)
|
|
MOVL DX, R8
|
|
INCL DX
|
|
CMPL (BX)(SI*1), DI
|
|
JNE search_loop_encodeBlockAsm16K
|
|
MOVL R8, DI
|
|
SUBL SI, DI
|
|
MOVL DI, 16(SP)
|
|
CMPQ CX, (SP)
|
|
JB dst_size_check_ok_4
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
dst_size_check_ok_4:
|
|
ADDL $0x03, DX
|
|
ADDL $0x04, SI
|
|
MOVQ src_len+32(FP), DI
|
|
SUBL DX, DI
|
|
LEAQ (BX)(DX*1), R8
|
|
LEAQ (BX)(SI*1), SI
|
|
|
|
// matchLen
|
|
XORL R11, R11
|
|
JMP matchlen_loop_16_entry_match_nolit2_encodeBlockAsm16K
|
|
|
|
matchlen_loopback_16_match_nolit2_encodeBlockAsm16K:
|
|
MOVQ (R8)(R11*1), R9
|
|
MOVQ 8(R8)(R11*1), R10
|
|
XORQ (SI)(R11*1), R9
|
|
JNZ matchlen_bsf_8_match_nolit2_encodeBlockAsm16K
|
|
XORQ 8(SI)(R11*1), R10
|
|
JNZ matchlen_bsf_16match_nolit2_encodeBlockAsm16K
|
|
LEAL -16(DI), DI
|
|
LEAL 16(R11), R11
|
|
|
|
matchlen_loop_16_entry_match_nolit2_encodeBlockAsm16K:
|
|
CMPL DI, $0x10
|
|
JAE matchlen_loopback_16_match_nolit2_encodeBlockAsm16K
|
|
JMP matchlen_match8_match_nolit2_encodeBlockAsm16K
|
|
|
|
matchlen_bsf_16match_nolit2_encodeBlockAsm16K:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R10, R10
|
|
|
|
#else
|
|
BSFQ R10, R10
|
|
|
|
#endif
|
|
SARQ $0x03, R10
|
|
LEAL 8(R11)(R10*1), R11
|
|
JMP match_nolit2_end_encodeBlockAsm16K
|
|
|
|
matchlen_match8_match_nolit2_encodeBlockAsm16K:
|
|
CMPL DI, $0x08
|
|
JB matchlen_match4_match_nolit2_encodeBlockAsm16K
|
|
MOVQ (R8)(R11*1), R9
|
|
XORQ (SI)(R11*1), R9
|
|
JNZ matchlen_bsf_8_match_nolit2_encodeBlockAsm16K
|
|
LEAL -8(DI), DI
|
|
LEAL 8(R11), R11
|
|
JMP matchlen_match4_match_nolit2_encodeBlockAsm16K
|
|
|
|
matchlen_bsf_8_match_nolit2_encodeBlockAsm16K:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R9, R9
|
|
|
|
#else
|
|
BSFQ R9, R9
|
|
|
|
#endif
|
|
SARQ $0x03, R9
|
|
LEAL (R11)(R9*1), R11
|
|
JMP match_nolit2_end_encodeBlockAsm16K
|
|
|
|
matchlen_match4_match_nolit2_encodeBlockAsm16K:
|
|
CMPL DI, $0x04
|
|
JB matchlen_match2_match_nolit2_encodeBlockAsm16K
|
|
MOVL (R8)(R11*1), R9
|
|
CMPL (SI)(R11*1), R9
|
|
JNE matchlen_match2_match_nolit2_encodeBlockAsm16K
|
|
LEAL -4(DI), DI
|
|
LEAL 4(R11), R11
|
|
|
|
matchlen_match2_match_nolit2_encodeBlockAsm16K:
|
|
CMPL DI, $0x01
|
|
JE matchlen_match1_match_nolit2_encodeBlockAsm16K
|
|
JB match_nolit2_end_encodeBlockAsm16K
|
|
MOVW (R8)(R11*1), R9
|
|
CMPW (SI)(R11*1), R9
|
|
JNE matchlen_match1_match_nolit2_encodeBlockAsm16K
|
|
LEAL 2(R11), R11
|
|
SUBL $0x02, DI
|
|
JZ match_nolit2_end_encodeBlockAsm16K
|
|
|
|
matchlen_match1_match_nolit2_encodeBlockAsm16K:
|
|
MOVB (R8)(R11*1), R9
|
|
CMPB (SI)(R11*1), R9
|
|
JNE match_nolit2_end_encodeBlockAsm16K
|
|
LEAL 1(R11), R11
|
|
|
|
match_nolit2_end_encodeBlockAsm16K:
|
|
ADDL R11, DX
|
|
ADDL $0x04, R11
|
|
MOVL DX, 12(SP)
|
|
MOVL 16(SP), SI
|
|
JMP match_nolits_copy_encodeBlockAsm16K
|
|
|
|
emit_remainder_encodeBlockAsm16K:
|
|
// Emit whatever source bytes are still pending as one final literal run.
// AX = src_len - 12(SP): number of trailing bytes not yet written out.
MOVQ src_len+32(FP), AX
|
|
MOVL 12(SP), DX
|
|
SUBL DX, AX
|
|
// Nothing pending: go straight to computing the return value.
JZ emit_remainder_end_encodeBlockAsm16K
|
|
// DX becomes a pointer to the first pending byte (BX is the source base
// everywhere else in the visible code; it is reused as scratch below).
LEAQ (BX)(DX*1), DX
|
|
// BX = write pointer + pending length + 3. If that reaches the output
// limit stored at (SP), give up and return 0.
LEAQ 3(CX)(AX*1), BX
|
|
CMPQ BX, (SP)
|
|
JB dst_size_check_ok_5
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
dst_size_check_ok_5:
|
|
// emitLiteral
// Choose the literal header size from the run length in AX.
|
|
// BX = length - 1. Values below 29 take the one-byte header path.
LEAL -1(AX), BX
|
|
CMPL BX, $0x1d
|
|
JB one_byte_emit_remainder_encodeBlockAsm16K
|
|
// BX = length - 30. Values below 256 take the two-byte header path.
SUBL $0x1d, BX
|
|
CMPL BX, $0x00000100
|
|
JB two_bytes_emit_remainder_encodeBlockAsm16K
|
|
// NOTE(review): this JB tests the same flags as the JB just above, which
// was not taken, so it can never be taken either. Execution falls through
// into the three-byte header label that follows, which is also this jump's
// target, so behaviour is unaffected.
JB three_bytes_emit_remainder_encodeBlockAsm16K
|
|
|
|
three_bytes_emit_remainder_encodeBlockAsm16K:
|
|
MOVB $0xf0, (CX)
|
|
MOVW BX, 1(CX)
|
|
ADDQ $0x03, CX
|
|
ADDL $0x1d, BX
|
|
JMP memmove_long_emit_remainder_encodeBlockAsm16K
|
|
|
|
two_bytes_emit_remainder_encodeBlockAsm16K:
|
|
MOVB $0xe8, (CX)
|
|
MOVB BL, 1(CX)
|
|
ADDL $0x1d, BX
|
|
ADDQ $0x02, CX
|
|
CMPL BX, $0x40
|
|
JB memmove_midemit_remainder_encodeBlockAsm16K
|
|
JMP memmove_long_emit_remainder_encodeBlockAsm16K
|
|
|
|
one_byte_emit_remainder_encodeBlockAsm16K:
|
|
SHLB $0x03, BL
|
|
MOVB BL, (CX)
|
|
ADDQ $0x01, CX
|
|
LEAQ (CX)(AX*1), BX
|
|
|
|
// genMemMoveShort
|
|
// margin: 0, min move: 1
|
|
CMPQ AX, $0x03
|
|
JB emit_lit_memmove_emit_remainder_encodeBlockAsm16K_memmove_move_1or2
|
|
JE emit_lit_memmove_emit_remainder_encodeBlockAsm16K_memmove_move_3
|
|
CMPQ AX, $0x08
|
|
JBE emit_lit_memmove_emit_remainder_encodeBlockAsm16K_memmove_move_4through8
|
|
CMPQ AX, $0x10
|
|
JBE emit_lit_memmove_emit_remainder_encodeBlockAsm16K_memmove_move_8through16
|
|
CMPQ AX, $0x20
|
|
JBE emit_lit_memmove_emit_remainder_encodeBlockAsm16K_memmove_move_17through32
|
|
JMP emit_lit_memmove_emit_remainder_encodeBlockAsm16K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBlockAsm16K_memmove_move_1or2:
|
|
MOVB (DX), SI
|
|
MOVB -1(DX)(AX*1), DL
|
|
MOVB SI, (CX)
|
|
MOVB DL, -1(CX)(AX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBlockAsm16K
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBlockAsm16K_memmove_move_3:
|
|
MOVW (DX), SI
|
|
MOVB 2(DX), DL
|
|
MOVW SI, (CX)
|
|
MOVB DL, 2(CX)
|
|
JMP memmove_end_copy_emit_remainder_encodeBlockAsm16K
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBlockAsm16K_memmove_move_4through8:
|
|
MOVL (DX), SI
|
|
MOVL -4(DX)(AX*1), DX
|
|
MOVL SI, (CX)
|
|
MOVL DX, -4(CX)(AX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBlockAsm16K
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBlockAsm16K_memmove_move_8through16:
|
|
MOVQ (DX), SI
|
|
MOVQ -8(DX)(AX*1), DX
|
|
MOVQ SI, (CX)
|
|
MOVQ DX, -8(CX)(AX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBlockAsm16K
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBlockAsm16K_memmove_move_17through32:
|
|
MOVOU (DX), X0
|
|
MOVOU -16(DX)(AX*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(AX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBlockAsm16K
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBlockAsm16K_memmove_move_33through64:
|
|
MOVOU (DX), X0
|
|
MOVOU 16(DX), X1
|
|
MOVOU -32(DX)(AX*1), X2
|
|
MOVOU -16(DX)(AX*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(AX*1)
|
|
MOVOU X3, -16(CX)(AX*1)
|
|
|
|
memmove_end_copy_emit_remainder_encodeBlockAsm16K:
|
|
MOVQ BX, CX
|
|
JMP emit_remainder_end_encodeBlockAsm16K
|
|
|
|
memmove_midemit_remainder_encodeBlockAsm16K:
|
|
LEAQ (CX)(AX*1), BX
|
|
|
|
// genMemMoveShort
|
|
// margin: 0, min move: 30
|
|
CMPQ AX, $0x20
|
|
JBE emit_lit_memmove_mid_emit_remainder_encodeBlockAsm16K_memmove_move_17through32
|
|
JMP emit_lit_memmove_mid_emit_remainder_encodeBlockAsm16K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_mid_emit_remainder_encodeBlockAsm16K_memmove_move_17through32:
|
|
MOVOU (DX), X0
|
|
MOVOU -16(DX)(AX*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(AX*1)
|
|
JMP memmove_mid_end_copy_emit_remainder_encodeBlockAsm16K
|
|
|
|
emit_lit_memmove_mid_emit_remainder_encodeBlockAsm16K_memmove_move_33through64:
|
|
MOVOU (DX), X0
|
|
MOVOU 16(DX), X1
|
|
MOVOU -32(DX)(AX*1), X2
|
|
MOVOU -16(DX)(AX*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(AX*1)
|
|
MOVOU X3, -16(CX)(AX*1)
|
|
|
|
memmove_mid_end_copy_emit_remainder_encodeBlockAsm16K:
|
|
MOVQ BX, CX
|
|
JMP emit_remainder_end_encodeBlockAsm16K
|
|
|
|
memmove_long_emit_remainder_encodeBlockAsm16K:
|
|
LEAQ (CX)(AX*1), BX
|
|
|
|
// genMemMoveLong
|
|
MOVOU (DX), X0
|
|
MOVOU 16(DX), X1
|
|
MOVOU -32(DX)(AX*1), X2
|
|
MOVOU -16(DX)(AX*1), X3
|
|
MOVQ AX, DI
|
|
SHRQ $0x05, DI
|
|
MOVQ CX, SI
|
|
ANDL $0x0000001f, SI
|
|
MOVQ $0x00000040, R8
|
|
SUBQ SI, R8
|
|
DECQ DI
|
|
JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm16Klarge_forward_sse_loop_32
|
|
LEAQ -32(DX)(R8*1), SI
|
|
LEAQ -32(CX)(R8*1), R9
|
|
|
|
emit_lit_memmove_long_emit_remainder_encodeBlockAsm16Klarge_big_loop_back:
|
|
MOVOU (SI), X4
|
|
MOVOU 16(SI), X5
|
|
MOVOA X4, (R9)
|
|
MOVOA X5, 16(R9)
|
|
ADDQ $0x20, R9
|
|
ADDQ $0x20, SI
|
|
ADDQ $0x20, R8
|
|
DECQ DI
|
|
JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm16Klarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_emit_remainder_encodeBlockAsm16Klarge_forward_sse_loop_32:
|
|
MOVOU -32(DX)(R8*1), X4
|
|
MOVOU -16(DX)(R8*1), X5
|
|
MOVOA X4, -32(CX)(R8*1)
|
|
MOVOA X5, -16(CX)(R8*1)
|
|
ADDQ $0x20, R8
|
|
CMPQ AX, R8
|
|
JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm16Klarge_forward_sse_loop_32
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(AX*1)
|
|
MOVOU X3, -16(CX)(AX*1)
|
|
MOVQ BX, CX
|
|
|
|
emit_remainder_end_encodeBlockAsm16K:
|
|
// Successful exit: return the number of bytes written, computed as the
// current write pointer (CX) minus dst_base.
MOVQ dst_base+0(FP), AX
|
|
SUBQ AX, CX
|
|
MOVQ CX, ret+56(FP)
|
|
RET
|
|
|
|
// func encodeBlockAsm4K(dst []byte, src []byte, tmp *[2048]byte) int
|
|
// Requires: BMI, CMOV, SSE2
//
// Setup performed below (before the search loop):
//   AX      = tmp, used afterwards as the base of a table of 16-bit entries
//   CX      = dst write pointer, starts at dst_base
//   BX      = src_base (loaded last; BX is scratch until then)
//   DX      = current source offset, starts at 1
//   (SP)    = output limit pointer; the visible code returns 0 when a write
//             would reach it
//   8(SP)   = src_len - 17, the source offset limit that ends searching
//   12(SP)  = offset of the first source byte not yet emitted, starts at 0
//   16(SP)  = current match offset used for repeat checks, starts at 1
// The return value is written to ret+56(FP); 0 signals that the output
// limit was hit.
|
|
TEXT ·encodeBlockAsm4K(SB), $24-64
|
|
MOVQ tmp+48(FP), AX
|
|
MOVQ dst_base+0(FP), CX
|
|
// 16 iterations x 128 bytes per iteration = 2048 bytes, i.e. all of tmp.
MOVQ $0x00000010, DX
|
|
MOVQ AX, BX
|
|
PXOR X0, X0
|
|
|
|
zero_loop_encodeBlockAsm4K:
|
|
// Clear tmp with eight unaligned 16-byte stores of zero per iteration.
MOVOU X0, (BX)
|
|
MOVOU X0, 16(BX)
|
|
MOVOU X0, 32(BX)
|
|
MOVOU X0, 48(BX)
|
|
MOVOU X0, 64(BX)
|
|
MOVOU X0, 80(BX)
|
|
MOVOU X0, 96(BX)
|
|
MOVOU X0, 112(BX)
|
|
ADDQ $0x80, BX
|
|
DECQ DX
|
|
JNZ zero_loop_encodeBlockAsm4K
|
|
// No source bytes have been emitted yet.
MOVL $0x00000000, 12(SP)
|
|
MOVQ src_len+32(FP), DX
|
|
// BX and SI both start as src_len - 17; SI is stored as the search limit.
LEAQ -17(DX), BX
|
|
LEAQ -17(DX), SI
|
|
MOVL SI, 8(SP)
|
|
// Output limit: dst_base + (src_len - 17 - src_len/32).
SHRQ $0x05, DX
|
|
SUBL DX, BX
|
|
LEAQ (CX)(BX*1), BX
|
|
MOVQ BX, (SP)
|
|
// Start scanning at source offset 1 with an initial repeat offset of 1.
MOVL $0x00000001, DX
|
|
MOVL DX, 16(SP)
|
|
MOVQ src_base+24(FP), BX
|
|
|
|
search_loop_encodeBlockAsm4K:
|
|
MOVL DX, SI
|
|
SUBL 12(SP), SI
|
|
SHRL $0x05, SI
|
|
LEAL 4(DX)(SI*1), SI
|
|
CMPL SI, 8(SP)
|
|
JAE emit_remainder_encodeBlockAsm4K
|
|
MOVQ (BX)(DX*1), DI
|
|
MOVL SI, 20(SP)
|
|
MOVQ $0x9e3779b1, R9
|
|
MOVQ DI, R10
|
|
MOVQ DI, R11
|
|
SHRQ $0x08, R11
|
|
SHLQ $0x20, R10
|
|
IMULQ R9, R10
|
|
SHRQ $0x36, R10
|
|
SHLQ $0x20, R11
|
|
IMULQ R9, R11
|
|
SHRQ $0x36, R11
|
|
MOVWLZX (AX)(R10*2), SI
|
|
MOVWLZX (AX)(R11*2), R8
|
|
MOVW DX, (AX)(R10*2)
|
|
MOVW DX, (AX)(R11*2)
|
|
MOVQ DI, R10
|
|
SHRQ $0x10, R10
|
|
SHLQ $0x20, R10
|
|
IMULQ R9, R10
|
|
SHRQ $0x36, R10
|
|
MOVL DX, R9
|
|
SUBL 16(SP), R9
|
|
MOVL 1(BX)(R9*1), R11
|
|
MOVQ DI, R9
|
|
SHRQ $0x08, R9
|
|
CMPL R9, R11
|
|
JNE no_repeat_found_encodeBlockAsm4K
|
|
LEAL 1(DX), DI
|
|
MOVL 12(SP), SI
|
|
MOVL DI, R8
|
|
SUBL 16(SP), R8
|
|
JZ repeat_extend_back_end_encodeBlockAsm4K
|
|
|
|
repeat_extend_back_loop_encodeBlockAsm4K:
|
|
CMPL DI, SI
|
|
JBE repeat_extend_back_end_encodeBlockAsm4K
|
|
MOVB -1(BX)(R8*1), R9
|
|
MOVB -1(BX)(DI*1), R10
|
|
CMPB R9, R10
|
|
JNE repeat_extend_back_end_encodeBlockAsm4K
|
|
LEAL -1(DI), DI
|
|
DECL R8
|
|
JNZ repeat_extend_back_loop_encodeBlockAsm4K
|
|
|
|
repeat_extend_back_end_encodeBlockAsm4K:
|
|
MOVL DI, SI
|
|
MOVL 12(SP), R8
|
|
SUBL R8, SI
|
|
LEAQ 3(CX)(SI*1), R9
|
|
CMPQ R9, (SP)
|
|
JB dst_size_check_ok_1
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
dst_size_check_ok_1:
|
|
LEAQ (BX)(R8*1), R8
|
|
|
|
// emitLiteral
|
|
LEAL -1(SI), R9
|
|
CMPL R9, $0x1d
|
|
JB one_byte_repeat_emit_lits_encodeBlockAsm4K
|
|
SUBL $0x1d, R9
|
|
CMPL R9, $0x00000100
|
|
JB two_bytes_repeat_emit_lits_encodeBlockAsm4K
|
|
JB three_bytes_repeat_emit_lits_encodeBlockAsm4K
|
|
|
|
three_bytes_repeat_emit_lits_encodeBlockAsm4K:
|
|
MOVB $0xf0, (CX)
|
|
MOVW R9, 1(CX)
|
|
ADDQ $0x03, CX
|
|
ADDL $0x1d, R9
|
|
JMP memmove_long_repeat_emit_lits_encodeBlockAsm4K
|
|
|
|
two_bytes_repeat_emit_lits_encodeBlockAsm4K:
|
|
MOVB $0xe8, (CX)
|
|
MOVB R9, 1(CX)
|
|
ADDL $0x1d, R9
|
|
ADDQ $0x02, CX
|
|
CMPL R9, $0x40
|
|
JB memmove_midrepeat_emit_lits_encodeBlockAsm4K
|
|
JMP memmove_long_repeat_emit_lits_encodeBlockAsm4K
|
|
|
|
one_byte_repeat_emit_lits_encodeBlockAsm4K:
|
|
SHLB $0x03, R9
|
|
MOVB R9, (CX)
|
|
ADDQ $0x01, CX
|
|
LEAQ (CX)(SI*1), R9
|
|
|
|
// genMemMoveShort
|
|
// margin: 16, min move: 1
|
|
CMPQ SI, $0x10
|
|
JBE emit_lit_memmove_repeat_emit_lits_encodeBlockAsm4K_memmove_move_8through16
|
|
CMPQ SI, $0x20
|
|
JBE emit_lit_memmove_repeat_emit_lits_encodeBlockAsm4K_memmove_move_17through32
|
|
JMP emit_lit_memmove_repeat_emit_lits_encodeBlockAsm4K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_repeat_emit_lits_encodeBlockAsm4K_memmove_move_8through16:
|
|
MOVOU (R8), X0
|
|
MOVOU X0, (CX)
|
|
JMP memmove_end_copy_repeat_emit_lits_encodeBlockAsm4K
|
|
|
|
emit_lit_memmove_repeat_emit_lits_encodeBlockAsm4K_memmove_move_17through32:
|
|
MOVOU (R8), X0
|
|
MOVOU -16(R8)(SI*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(SI*1)
|
|
JMP memmove_end_copy_repeat_emit_lits_encodeBlockAsm4K
|
|
|
|
emit_lit_memmove_repeat_emit_lits_encodeBlockAsm4K_memmove_move_33through64:
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(SI*1), X2
|
|
MOVOU -16(R8)(SI*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(SI*1)
|
|
MOVOU X3, -16(CX)(SI*1)
|
|
|
|
memmove_end_copy_repeat_emit_lits_encodeBlockAsm4K:
|
|
MOVQ R9, CX
|
|
JMP repeat_emit_lits_end_encodeBlockAsm4K
|
|
|
|
memmove_midrepeat_emit_lits_encodeBlockAsm4K:
|
|
LEAQ (CX)(SI*1), R9
|
|
|
|
// genMemMoveShort
|
|
// margin: 15, min move: 30
|
|
CMPQ SI, $0x20
|
|
JBE emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm4K_memmove_move_17through32
|
|
JMP emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm4K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm4K_memmove_move_17through32:
|
|
MOVOU (R8), X0
|
|
MOVOU -16(R8)(SI*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(SI*1)
|
|
JMP memmove_mid_end_copy_repeat_emit_lits_encodeBlockAsm4K
|
|
|
|
emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm4K_memmove_move_33through64:
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(SI*1), X2
|
|
MOVOU -16(R8)(SI*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(SI*1)
|
|
MOVOU X3, -16(CX)(SI*1)
|
|
|
|
memmove_mid_end_copy_repeat_emit_lits_encodeBlockAsm4K:
|
|
MOVQ R9, CX
|
|
JMP repeat_emit_lits_end_encodeBlockAsm4K
|
|
|
|
memmove_long_repeat_emit_lits_encodeBlockAsm4K:
|
|
LEAQ (CX)(SI*1), R9
|
|
|
|
// genMemMoveLong
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(SI*1), X2
|
|
MOVOU -16(R8)(SI*1), X3
|
|
MOVQ SI, R11
|
|
SHRQ $0x05, R11
|
|
MOVQ CX, R10
|
|
ANDL $0x0000001f, R10
|
|
MOVQ $0x00000040, R12
|
|
SUBQ R10, R12
|
|
DECQ R11
|
|
JA emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm4Klarge_forward_sse_loop_32
|
|
LEAQ -32(R8)(R12*1), R10
|
|
LEAQ -32(CX)(R12*1), R13
|
|
|
|
emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm4Klarge_big_loop_back:
|
|
MOVOU (R10), X4
|
|
MOVOU 16(R10), X5
|
|
MOVOA X4, (R13)
|
|
MOVOA X5, 16(R13)
|
|
ADDQ $0x20, R13
|
|
ADDQ $0x20, R10
|
|
ADDQ $0x20, R12
|
|
DECQ R11
|
|
JNA emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm4Klarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm4Klarge_forward_sse_loop_32:
|
|
MOVOU -32(R8)(R12*1), X4
|
|
MOVOU -16(R8)(R12*1), X5
|
|
MOVOA X4, -32(CX)(R12*1)
|
|
MOVOA X5, -16(CX)(R12*1)
|
|
ADDQ $0x20, R12
|
|
CMPQ SI, R12
|
|
JAE emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm4Klarge_forward_sse_loop_32
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(SI*1)
|
|
MOVOU X3, -16(CX)(SI*1)
|
|
MOVQ R9, CX
|
|
|
|
repeat_emit_lits_end_encodeBlockAsm4K:
|
|
ADDL $0x05, DX
|
|
MOVL DX, SI
|
|
SUBL 16(SP), SI
|
|
MOVQ src_len+32(FP), R8
|
|
SUBL DX, R8
|
|
LEAQ (BX)(DX*1), R9
|
|
LEAQ (BX)(SI*1), SI
|
|
|
|
// matchLen
|
|
XORL R11, R11
|
|
JMP matchlen_loop_16_entry_repeat_extend_encodeBlockAsm4K
|
|
|
|
matchlen_loopback_16_repeat_extend_encodeBlockAsm4K:
|
|
MOVQ (R9)(R11*1), R10
|
|
MOVQ 8(R9)(R11*1), R12
|
|
XORQ (SI)(R11*1), R10
|
|
JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm4K
|
|
XORQ 8(SI)(R11*1), R12
|
|
JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm4K
|
|
LEAL -16(R8), R8
|
|
LEAL 16(R11), R11
|
|
|
|
matchlen_loop_16_entry_repeat_extend_encodeBlockAsm4K:
|
|
CMPL R8, $0x10
|
|
JAE matchlen_loopback_16_repeat_extend_encodeBlockAsm4K
|
|
JMP matchlen_match8_repeat_extend_encodeBlockAsm4K
|
|
|
|
matchlen_bsf_16repeat_extend_encodeBlockAsm4K:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R12, R12
|
|
|
|
#else
|
|
BSFQ R12, R12
|
|
|
|
#endif
|
|
SARQ $0x03, R12
|
|
LEAL 8(R11)(R12*1), R11
|
|
JMP repeat_extend_forward_end_encodeBlockAsm4K
|
|
|
|
matchlen_match8_repeat_extend_encodeBlockAsm4K:
|
|
CMPL R8, $0x08
|
|
JB matchlen_match4_repeat_extend_encodeBlockAsm4K
|
|
MOVQ (R9)(R11*1), R10
|
|
XORQ (SI)(R11*1), R10
|
|
JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm4K
|
|
LEAL -8(R8), R8
|
|
LEAL 8(R11), R11
|
|
JMP matchlen_match4_repeat_extend_encodeBlockAsm4K
|
|
|
|
matchlen_bsf_8_repeat_extend_encodeBlockAsm4K:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R10, R10
|
|
|
|
#else
|
|
BSFQ R10, R10
|
|
|
|
#endif
|
|
SARQ $0x03, R10
|
|
LEAL (R11)(R10*1), R11
|
|
JMP repeat_extend_forward_end_encodeBlockAsm4K
|
|
|
|
matchlen_match4_repeat_extend_encodeBlockAsm4K:
|
|
CMPL R8, $0x04
|
|
JB matchlen_match2_repeat_extend_encodeBlockAsm4K
|
|
MOVL (R9)(R11*1), R10
|
|
CMPL (SI)(R11*1), R10
|
|
JNE matchlen_match2_repeat_extend_encodeBlockAsm4K
|
|
LEAL -4(R8), R8
|
|
LEAL 4(R11), R11
|
|
|
|
matchlen_match2_repeat_extend_encodeBlockAsm4K:
|
|
CMPL R8, $0x01
|
|
JE matchlen_match1_repeat_extend_encodeBlockAsm4K
|
|
JB repeat_extend_forward_end_encodeBlockAsm4K
|
|
MOVW (R9)(R11*1), R10
|
|
CMPW (SI)(R11*1), R10
|
|
JNE matchlen_match1_repeat_extend_encodeBlockAsm4K
|
|
LEAL 2(R11), R11
|
|
SUBL $0x02, R8
|
|
JZ repeat_extend_forward_end_encodeBlockAsm4K
|
|
|
|
matchlen_match1_repeat_extend_encodeBlockAsm4K:
|
|
MOVB (R9)(R11*1), R10
|
|
CMPB (SI)(R11*1), R10
|
|
JNE repeat_extend_forward_end_encodeBlockAsm4K
|
|
LEAL 1(R11), R11
|
|
|
|
repeat_extend_forward_end_encodeBlockAsm4K:
|
|
// R11 holds the number of bytes matched by the forward extension.
// Advance the source offset past them, then SI = DX - DI is the full
// repeat length measured from the back-extended start held in DI.
ADDL R11, DX
|
|
MOVL DX, SI
|
|
SUBL DI, SI
|
|
// NOTE(review): the value loaded into DI here is overwritten by the next
// instruction that touches DI, before it is ever read.
MOVL 16(SP), DI
|
|
|
|
// emitRepeat
// Encodes the repeat length in SI using one of four layouts:
//   SI <= 29      : 1 byte,  tag = SI*8 - 4
//   SI <  286     : 2 bytes, 0xec then one byte  (SI - 30)
//   SI <  65566   : 3 bytes, 0xf4 then 16 bits   (SI - 30)
//   otherwise     : 4 bytes, 0xfc then (SI - 30)
|
|
// NOTE(review): this DI value is not read on any path; the one-byte path
// zeroes DI and the other paths replace it with SI - 30.
LEAL -1(SI), DI
|
|
CMPL SI, $0x1d
|
|
JBE repeat_one_match_repeat_encodeBlockAsm4K
|
|
LEAL -30(SI), DI
|
|
CMPL SI, $0x0000011e
|
|
JB repeat_two_match_repeat_encodeBlockAsm4K
|
|
CMPL SI, $0x0001001e
|
|
JB repeat_three_match_repeat_encodeBlockAsm4K
|
|
MOVB $0xfc, (CX)
|
|
// A 32-bit store at offset 1, but CX only advances by 4: the highest byte
// of this store lands at the new CX and is not counted as output.
MOVL DI, 1(CX)
|
|
ADDQ $0x04, CX
|
|
JMP repeat_end_emit_encodeBlockAsm4K
|
|
|
|
repeat_three_match_repeat_encodeBlockAsm4K:
|
|
MOVB $0xf4, (CX)
|
|
MOVW DI, 1(CX)
|
|
ADDQ $0x03, CX
|
|
JMP repeat_end_emit_encodeBlockAsm4K
|
|
|
|
repeat_two_match_repeat_encodeBlockAsm4K:
|
|
MOVB $0xec, (CX)
|
|
MOVB DI, 1(CX)
|
|
ADDQ $0x02, CX
|
|
JMP repeat_end_emit_encodeBlockAsm4K
|
|
|
|
repeat_one_match_repeat_encodeBlockAsm4K:
|
|
// DI = 0, then DI = SI*8 - 4 via LEAL; only the low byte is stored.
XORL DI, DI
|
|
LEAL -4(DI)(SI*8), DI
|
|
MOVB DI, (CX)
|
|
ADDQ $0x01, CX
|
|
|
|
repeat_end_emit_encodeBlockAsm4K:
|
|
// Record DX as the new start of not-yet-emitted input and resume searching.
MOVL DX, 12(SP)
|
|
JMP search_loop_encodeBlockAsm4K
|
|
|
|
no_repeat_found_encodeBlockAsm4K:
|
|
CMPL (BX)(SI*1), DI
|
|
JEQ candidate_match_encodeBlockAsm4K
|
|
SHRQ $0x08, DI
|
|
MOVWLZX (AX)(R10*2), SI
|
|
LEAL 2(DX), R9
|
|
CMPL (BX)(R8*1), DI
|
|
JEQ candidate2_match_encodeBlockAsm4K
|
|
MOVW R9, (AX)(R10*2)
|
|
SHRQ $0x08, DI
|
|
CMPL (BX)(SI*1), DI
|
|
JEQ candidate3_match_encodeBlockAsm4K
|
|
MOVL 20(SP), DX
|
|
JMP search_loop_encodeBlockAsm4K
|
|
|
|
candidate3_match_encodeBlockAsm4K:
|
|
ADDL $0x02, DX
|
|
JMP candidate_match_encodeBlockAsm4K
|
|
|
|
candidate2_match_encodeBlockAsm4K:
|
|
MOVW R9, (AX)(R10*2)
|
|
INCL DX
|
|
MOVL R8, SI
|
|
|
|
candidate_match_encodeBlockAsm4K:
|
|
MOVL 12(SP), DI
|
|
TESTL SI, SI
|
|
JZ match_extend_back_end_encodeBlockAsm4K
|
|
|
|
match_extend_back_loop_encodeBlockAsm4K:
|
|
CMPL DX, DI
|
|
JBE match_extend_back_end_encodeBlockAsm4K
|
|
MOVB -1(BX)(SI*1), R8
|
|
MOVB -1(BX)(DX*1), R9
|
|
CMPB R8, R9
|
|
JNE match_extend_back_end_encodeBlockAsm4K
|
|
LEAL -1(DX), DX
|
|
DECL SI
|
|
JZ match_extend_back_end_encodeBlockAsm4K
|
|
JMP match_extend_back_loop_encodeBlockAsm4K
|
|
|
|
match_extend_back_end_encodeBlockAsm4K:
|
|
CMPQ CX, (SP)
|
|
JB dst_size_check_ok_2
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
dst_size_check_ok_2:
|
|
MOVL DX, R8
|
|
MOVL DX, DI
|
|
SUBL SI, DI
|
|
MOVL DI, 16(SP)
|
|
ADDL $0x04, DX
|
|
ADDL $0x04, SI
|
|
MOVQ src_len+32(FP), DI
|
|
SUBL DX, DI
|
|
LEAQ (BX)(DX*1), R9
|
|
LEAQ (BX)(SI*1), SI
|
|
|
|
// matchLen
|
|
XORL R11, R11
|
|
JMP matchlen_loop_16_entry_match_nolit_encodeBlockAsm4K
|
|
|
|
matchlen_loopback_16_match_nolit_encodeBlockAsm4K:
|
|
MOVQ (R9)(R11*1), R10
|
|
MOVQ 8(R9)(R11*1), R12
|
|
XORQ (SI)(R11*1), R10
|
|
JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm4K
|
|
XORQ 8(SI)(R11*1), R12
|
|
JNZ matchlen_bsf_16match_nolit_encodeBlockAsm4K
|
|
LEAL -16(DI), DI
|
|
LEAL 16(R11), R11
|
|
|
|
matchlen_loop_16_entry_match_nolit_encodeBlockAsm4K:
|
|
CMPL DI, $0x10
|
|
JAE matchlen_loopback_16_match_nolit_encodeBlockAsm4K
|
|
JMP matchlen_match8_match_nolit_encodeBlockAsm4K
|
|
|
|
matchlen_bsf_16match_nolit_encodeBlockAsm4K:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R12, R12
|
|
|
|
#else
|
|
BSFQ R12, R12
|
|
|
|
#endif
|
|
SARQ $0x03, R12
|
|
LEAL 8(R11)(R12*1), R11
|
|
JMP match_nolit_end_encodeBlockAsm4K
|
|
|
|
matchlen_match8_match_nolit_encodeBlockAsm4K:
|
|
CMPL DI, $0x08
|
|
JB matchlen_match4_match_nolit_encodeBlockAsm4K
|
|
MOVQ (R9)(R11*1), R10
|
|
XORQ (SI)(R11*1), R10
|
|
JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm4K
|
|
LEAL -8(DI), DI
|
|
LEAL 8(R11), R11
|
|
JMP matchlen_match4_match_nolit_encodeBlockAsm4K
|
|
|
|
matchlen_bsf_8_match_nolit_encodeBlockAsm4K:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R10, R10
|
|
|
|
#else
|
|
BSFQ R10, R10
|
|
|
|
#endif
|
|
SARQ $0x03, R10
|
|
LEAL (R11)(R10*1), R11
|
|
JMP match_nolit_end_encodeBlockAsm4K
|
|
|
|
matchlen_match4_match_nolit_encodeBlockAsm4K:
|
|
CMPL DI, $0x04
|
|
JB matchlen_match2_match_nolit_encodeBlockAsm4K
|
|
MOVL (R9)(R11*1), R10
|
|
CMPL (SI)(R11*1), R10
|
|
JNE matchlen_match2_match_nolit_encodeBlockAsm4K
|
|
LEAL -4(DI), DI
|
|
LEAL 4(R11), R11
|
|
|
|
matchlen_match2_match_nolit_encodeBlockAsm4K:
|
|
CMPL DI, $0x01
|
|
JE matchlen_match1_match_nolit_encodeBlockAsm4K
|
|
JB match_nolit_end_encodeBlockAsm4K
|
|
MOVW (R9)(R11*1), R10
|
|
CMPW (SI)(R11*1), R10
|
|
JNE matchlen_match1_match_nolit_encodeBlockAsm4K
|
|
LEAL 2(R11), R11
|
|
SUBL $0x02, DI
|
|
JZ match_nolit_end_encodeBlockAsm4K
|
|
|
|
matchlen_match1_match_nolit_encodeBlockAsm4K:
|
|
MOVB (R9)(R11*1), R10
|
|
CMPB (SI)(R11*1), R10
|
|
JNE match_nolit_end_encodeBlockAsm4K
|
|
LEAL 1(R11), R11
|
|
|
|
match_nolit_end_encodeBlockAsm4K:
|
|
ADDL R11, DX
|
|
ADDL $0x04, R11
|
|
MOVL 16(SP), SI
|
|
MOVL 12(SP), DI
|
|
MOVL DX, 12(SP)
|
|
SUBL DI, R8
|
|
JZ match_nolits_copy_encodeBlockAsm4K
|
|
LEAQ (BX)(DI*1), DI
|
|
CMPL R8, $0x03
|
|
JA match_emit_lits_copy_encodeBlockAsm4K
|
|
CMPL SI, $0x40
|
|
JB match_emit_lits_copy_encodeBlockAsm4K
|
|
MOVL (DI), DI
|
|
|
|
// emitCopy2WithLits
|
|
XORQ R9, R9
|
|
SUBL $0x40, SI
|
|
LEAL -11(R11), R10
|
|
LEAL -4(R11), R11
|
|
MOVW SI, 1(CX)
|
|
CMPL R11, $0x07
|
|
CMOVLGE R10, R9
|
|
MOVQ $0x00000007, SI
|
|
CMOVLLT R11, SI
|
|
LEAL -1(R8)(SI*4), SI
|
|
MOVL $0x00000003, R10
|
|
LEAL (R10)(SI*8), SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x03, CX
|
|
MOVL DI, (CX)
|
|
ADDQ R8, CX
|
|
TESTL R9, R9
|
|
JZ match_nolit_emitcopy_end_encodeBlockAsm4K
|
|
|
|
// emitRepeat
|
|
LEAL -1(R9), SI
|
|
CMPL R9, $0x1d
|
|
JBE repeat_one_match_emit_repeat_copy2_encodeBlockAsm4K
|
|
LEAL -30(R9), SI
|
|
CMPL R9, $0x0000011e
|
|
JB repeat_two_match_emit_repeat_copy2_encodeBlockAsm4K
|
|
CMPL R9, $0x0001001e
|
|
JB repeat_three_match_emit_repeat_copy2_encodeBlockAsm4K
|
|
MOVB $0xfc, (CX)
|
|
MOVL SI, 1(CX)
|
|
ADDQ $0x04, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm4K
|
|
|
|
repeat_three_match_emit_repeat_copy2_encodeBlockAsm4K:
|
|
MOVB $0xf4, (CX)
|
|
MOVW SI, 1(CX)
|
|
ADDQ $0x03, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm4K
|
|
|
|
repeat_two_match_emit_repeat_copy2_encodeBlockAsm4K:
|
|
MOVB $0xec, (CX)
|
|
MOVB SI, 1(CX)
|
|
ADDQ $0x02, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm4K
|
|
|
|
repeat_one_match_emit_repeat_copy2_encodeBlockAsm4K:
|
|
XORL SI, SI
|
|
LEAL -4(SI)(R9*8), SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x01, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm4K
|
|
|
|
match_emit_lits_copy_encodeBlockAsm4K:
|
|
LEAQ 3(CX)(R8*1), R9
|
|
CMPQ R9, (SP)
|
|
JB dst_size_check_ok_3
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
dst_size_check_ok_3:
|
|
// emitLiteral
|
|
LEAL -1(R8), R9
|
|
CMPL R9, $0x1d
|
|
JB one_byte_match_emit_encodeBlockAsm4K
|
|
SUBL $0x1d, R9
|
|
CMPL R9, $0x00000100
|
|
JB two_bytes_match_emit_encodeBlockAsm4K
|
|
JB three_bytes_match_emit_encodeBlockAsm4K
|
|
|
|
three_bytes_match_emit_encodeBlockAsm4K:
|
|
MOVB $0xf0, (CX)
|
|
MOVW R9, 1(CX)
|
|
ADDQ $0x03, CX
|
|
ADDL $0x1d, R9
|
|
JMP memmove_long_match_emit_encodeBlockAsm4K
|
|
|
|
two_bytes_match_emit_encodeBlockAsm4K:
|
|
MOVB $0xe8, (CX)
|
|
MOVB R9, 1(CX)
|
|
ADDL $0x1d, R9
|
|
ADDQ $0x02, CX
|
|
CMPL R9, $0x40
|
|
JB memmove_midmatch_emit_encodeBlockAsm4K
|
|
JMP memmove_long_match_emit_encodeBlockAsm4K
|
|
|
|
one_byte_match_emit_encodeBlockAsm4K:
|
|
SHLB $0x03, R9
|
|
MOVB R9, (CX)
|
|
ADDQ $0x01, CX
|
|
LEAQ (CX)(R8*1), R9
|
|
|
|
// genMemMoveShort
|
|
// margin: 16, min move: 1
|
|
CMPQ R8, $0x10
|
|
JBE emit_lit_memmove_match_emit_encodeBlockAsm4K_memmove_move_8through16
|
|
CMPQ R8, $0x20
|
|
JBE emit_lit_memmove_match_emit_encodeBlockAsm4K_memmove_move_17through32
|
|
JMP emit_lit_memmove_match_emit_encodeBlockAsm4K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_match_emit_encodeBlockAsm4K_memmove_move_8through16:
|
|
MOVOU (DI), X0
|
|
MOVOU X0, (CX)
|
|
JMP memmove_end_copy_match_emit_encodeBlockAsm4K
|
|
|
|
emit_lit_memmove_match_emit_encodeBlockAsm4K_memmove_move_17through32:
|
|
MOVOU (DI), X0
|
|
MOVOU -16(DI)(R8*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(R8*1)
|
|
JMP memmove_end_copy_match_emit_encodeBlockAsm4K
|
|
|
|
emit_lit_memmove_match_emit_encodeBlockAsm4K_memmove_move_33through64:
|
|
MOVOU (DI), X0
|
|
MOVOU 16(DI), X1
|
|
MOVOU -32(DI)(R8*1), X2
|
|
MOVOU -16(DI)(R8*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(R8*1)
|
|
MOVOU X3, -16(CX)(R8*1)
|
|
|
|
memmove_end_copy_match_emit_encodeBlockAsm4K:
|
|
MOVQ R9, CX
|
|
JMP match_nolits_copy_encodeBlockAsm4K
|
|
|
|
memmove_midmatch_emit_encodeBlockAsm4K:
|
|
LEAQ (CX)(R8*1), R9
|
|
|
|
// genMemMoveShort
|
|
// margin: 15, min move: 30
|
|
CMPQ R8, $0x20
|
|
JBE emit_lit_memmove_mid_match_emit_encodeBlockAsm4K_memmove_move_17through32
|
|
JMP emit_lit_memmove_mid_match_emit_encodeBlockAsm4K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_mid_match_emit_encodeBlockAsm4K_memmove_move_17through32:
|
|
MOVOU (DI), X0
|
|
MOVOU -16(DI)(R8*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(R8*1)
|
|
JMP memmove_mid_end_copy_match_emit_encodeBlockAsm4K
|
|
|
|
emit_lit_memmove_mid_match_emit_encodeBlockAsm4K_memmove_move_33through64:
|
|
MOVOU (DI), X0
|
|
MOVOU 16(DI), X1
|
|
MOVOU -32(DI)(R8*1), X2
|
|
MOVOU -16(DI)(R8*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(R8*1)
|
|
MOVOU X3, -16(CX)(R8*1)
|
|
|
|
memmove_mid_end_copy_match_emit_encodeBlockAsm4K:
|
|
MOVQ R9, CX
|
|
JMP match_nolits_copy_encodeBlockAsm4K
|
|
|
|
memmove_long_match_emit_encodeBlockAsm4K:
|
|
LEAQ (CX)(R8*1), R9
|
|
|
|
// genMemMoveLong
|
|
MOVOU (DI), X0
|
|
MOVOU 16(DI), X1
|
|
MOVOU -32(DI)(R8*1), X2
|
|
MOVOU -16(DI)(R8*1), X3
|
|
MOVQ R8, R12
|
|
SHRQ $0x05, R12
|
|
MOVQ CX, R10
|
|
ANDL $0x0000001f, R10
|
|
MOVQ $0x00000040, R13
|
|
SUBQ R10, R13
|
|
DECQ R12
|
|
JA emit_lit_memmove_long_match_emit_encodeBlockAsm4Klarge_forward_sse_loop_32
|
|
LEAQ -32(DI)(R13*1), R10
|
|
LEAQ -32(CX)(R13*1), R14
|
|
|
|
emit_lit_memmove_long_match_emit_encodeBlockAsm4Klarge_big_loop_back:
|
|
MOVOU (R10), X4
|
|
MOVOU 16(R10), X5
|
|
MOVOA X4, (R14)
|
|
MOVOA X5, 16(R14)
|
|
ADDQ $0x20, R14
|
|
ADDQ $0x20, R10
|
|
ADDQ $0x20, R13
|
|
DECQ R12
|
|
JNA emit_lit_memmove_long_match_emit_encodeBlockAsm4Klarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_match_emit_encodeBlockAsm4Klarge_forward_sse_loop_32:
|
|
MOVOU -32(DI)(R13*1), X4
|
|
MOVOU -16(DI)(R13*1), X5
|
|
MOVOA X4, -32(CX)(R13*1)
|
|
MOVOA X5, -16(CX)(R13*1)
|
|
ADDQ $0x20, R13
|
|
CMPQ R8, R13
|
|
JAE emit_lit_memmove_long_match_emit_encodeBlockAsm4Klarge_forward_sse_loop_32
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(R8*1)
|
|
MOVOU X3, -16(CX)(R8*1)
|
|
MOVQ R9, CX
|
|
|
|
match_nolits_copy_encodeBlockAsm4K:
|
|
// emitCopy
|
|
CMPL SI, $0x00000400
|
|
JA two_byte_match_nolit_encodeBlockAsm4K
|
|
CMPL R11, $0x00000013
|
|
JAE emit_one_longer_match_nolit_encodeBlockAsm4K
|
|
LEAL -1(SI), SI
|
|
SHLL $0x06, SI
|
|
LEAL -15(SI)(R11*4), SI
|
|
MOVW SI, (CX)
|
|
ADDQ $0x02, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm4K
|
|
|
|
emit_one_longer_match_nolit_encodeBlockAsm4K:
|
|
CMPL R11, $0x00000112
|
|
JAE emit_copy1_repeat_match_nolit_encodeBlockAsm4K
|
|
LEAL -1(SI), SI
|
|
SHLL $0x06, SI
|
|
LEAL 61(SI), SI
|
|
MOVW SI, (CX)
|
|
LEAL -18(R11), SI
|
|
MOVB SI, 2(CX)
|
|
ADDQ $0x03, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm4K
|
|
|
|
emit_copy1_repeat_match_nolit_encodeBlockAsm4K:
|
|
LEAL -1(SI), SI
|
|
SHLL $0x06, SI
|
|
LEAL 57(SI), SI
|
|
MOVW SI, (CX)
|
|
ADDQ $0x02, CX
|
|
SUBL $0x12, R11
|
|
|
|
// emitRepeat
|
|
LEAL -1(R11), SI
|
|
CMPL R11, $0x1d
|
|
JBE repeat_one_emit_copy1_do_repeat_match_nolit_encodeBlockAsm4K
|
|
LEAL -30(R11), SI
|
|
CMPL R11, $0x0000011e
|
|
JB repeat_two_emit_copy1_do_repeat_match_nolit_encodeBlockAsm4K
|
|
CMPL R11, $0x0001001e
|
|
JB repeat_three_emit_copy1_do_repeat_match_nolit_encodeBlockAsm4K
|
|
MOVB $0xfc, (CX)
|
|
MOVL SI, 1(CX)
|
|
ADDQ $0x04, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm4K
|
|
|
|
repeat_three_emit_copy1_do_repeat_match_nolit_encodeBlockAsm4K:
|
|
MOVB $0xf4, (CX)
|
|
MOVW SI, 1(CX)
|
|
ADDQ $0x03, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm4K
|
|
|
|
repeat_two_emit_copy1_do_repeat_match_nolit_encodeBlockAsm4K:
|
|
MOVB $0xec, (CX)
|
|
MOVB SI, 1(CX)
|
|
ADDQ $0x02, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm4K
|
|
|
|
repeat_one_emit_copy1_do_repeat_match_nolit_encodeBlockAsm4K:
|
|
XORL SI, SI
|
|
LEAL -4(SI)(R11*8), SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x01, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm4K
|
|
|
|
two_byte_match_nolit_encodeBlockAsm4K:
|
|
// emitCopy2
|
|
LEAL -64(SI), SI
|
|
LEAL -4(R11), R11
|
|
MOVW SI, 1(CX)
|
|
CMPL R11, $0x3c
|
|
JBE emit_copy2_0_match_nolit_encodeBlockAsm4K_emit2
|
|
LEAL -60(R11), SI
|
|
CMPL R11, $0x0000013c
|
|
JB emit_copy2_1_match_nolit_encodeBlockAsm4K_emit2
|
|
CMPL R11, $0x0001003c
|
|
JB emit_copy2_2_match_nolit_encodeBlockAsm4K_emit2
|
|
MOVB $0xfe, (CX)
|
|
MOVL SI, 3(CX)
|
|
ADDQ $0x06, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm4K
|
|
|
|
emit_copy2_2_match_nolit_encodeBlockAsm4K_emit2:
|
|
MOVB $0xfa, (CX)
|
|
MOVW SI, 3(CX)
|
|
ADDQ $0x05, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm4K
|
|
|
|
emit_copy2_1_match_nolit_encodeBlockAsm4K_emit2:
|
|
MOVB $0xf6, (CX)
|
|
MOVB SI, 3(CX)
|
|
ADDQ $0x04, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm4K
|
|
|
|
emit_copy2_0_match_nolit_encodeBlockAsm4K_emit2:
|
|
MOVL $0x00000002, SI
|
|
LEAL (SI)(R11*4), SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x03, CX
|
|
|
|
match_nolit_emitcopy_end_encodeBlockAsm4K:
|
|
CMPL DX, 8(SP)
|
|
JAE emit_remainder_encodeBlockAsm4K
|
|
MOVQ -2(BX)(DX*1), DI
|
|
CMPQ CX, (SP)
|
|
JB match_nolit_dst_ok_encodeBlockAsm4K
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
match_nolit_dst_ok_encodeBlockAsm4K:
|
|
MOVQ $0x9e3779b1, SI
|
|
MOVQ DI, R8
|
|
SHRQ $0x10, DI
|
|
MOVQ DI, R9
|
|
SHLQ $0x20, R8
|
|
IMULQ SI, R8
|
|
SHRQ $0x36, R8
|
|
SHLQ $0x20, R9
|
|
IMULQ SI, R9
|
|
SHRQ $0x36, R9
|
|
LEAL -2(DX), R10
|
|
MOVWLZX (AX)(R9*2), SI
|
|
MOVW R10, (AX)(R8*2)
|
|
MOVW DX, (AX)(R9*2)
|
|
MOVL DX, R8
|
|
INCL DX
|
|
CMPL (BX)(SI*1), DI
|
|
JNE search_loop_encodeBlockAsm4K
|
|
MOVL R8, DI
|
|
SUBL SI, DI
|
|
MOVL DI, 16(SP)
|
|
CMPQ CX, (SP)
|
|
JB dst_size_check_ok_4
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
dst_size_check_ok_4:
|
|
ADDL $0x03, DX
|
|
ADDL $0x04, SI
|
|
MOVQ src_len+32(FP), DI
|
|
SUBL DX, DI
|
|
LEAQ (BX)(DX*1), R8
|
|
LEAQ (BX)(SI*1), SI
|
|
|
|
// matchLen
|
|
XORL R11, R11
|
|
JMP matchlen_loop_16_entry_match_nolit2_encodeBlockAsm4K
|
|
|
|
matchlen_loopback_16_match_nolit2_encodeBlockAsm4K:
|
|
MOVQ (R8)(R11*1), R9
|
|
MOVQ 8(R8)(R11*1), R10
|
|
XORQ (SI)(R11*1), R9
|
|
JNZ matchlen_bsf_8_match_nolit2_encodeBlockAsm4K
|
|
XORQ 8(SI)(R11*1), R10
|
|
JNZ matchlen_bsf_16match_nolit2_encodeBlockAsm4K
|
|
LEAL -16(DI), DI
|
|
LEAL 16(R11), R11
|
|
|
|
matchlen_loop_16_entry_match_nolit2_encodeBlockAsm4K:
|
|
CMPL DI, $0x10
|
|
JAE matchlen_loopback_16_match_nolit2_encodeBlockAsm4K
|
|
JMP matchlen_match8_match_nolit2_encodeBlockAsm4K
|
|
|
|
matchlen_bsf_16match_nolit2_encodeBlockAsm4K:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R10, R10
|
|
|
|
#else
|
|
BSFQ R10, R10
|
|
|
|
#endif
|
|
SARQ $0x03, R10
|
|
LEAL 8(R11)(R10*1), R11
|
|
JMP match_nolit2_end_encodeBlockAsm4K
|
|
|
|
matchlen_match8_match_nolit2_encodeBlockAsm4K:
|
|
CMPL DI, $0x08
|
|
JB matchlen_match4_match_nolit2_encodeBlockAsm4K
|
|
MOVQ (R8)(R11*1), R9
|
|
XORQ (SI)(R11*1), R9
|
|
JNZ matchlen_bsf_8_match_nolit2_encodeBlockAsm4K
|
|
LEAL -8(DI), DI
|
|
LEAL 8(R11), R11
|
|
JMP matchlen_match4_match_nolit2_encodeBlockAsm4K
|
|
|
|
matchlen_bsf_8_match_nolit2_encodeBlockAsm4K:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R9, R9
|
|
|
|
#else
|
|
BSFQ R9, R9
|
|
|
|
#endif
|
|
SARQ $0x03, R9
|
|
LEAL (R11)(R9*1), R11
|
|
JMP match_nolit2_end_encodeBlockAsm4K
|
|
|
|
matchlen_match4_match_nolit2_encodeBlockAsm4K:
|
|
CMPL DI, $0x04
|
|
JB matchlen_match2_match_nolit2_encodeBlockAsm4K
|
|
MOVL (R8)(R11*1), R9
|
|
CMPL (SI)(R11*1), R9
|
|
JNE matchlen_match2_match_nolit2_encodeBlockAsm4K
|
|
LEAL -4(DI), DI
|
|
LEAL 4(R11), R11
|
|
|
|
matchlen_match2_match_nolit2_encodeBlockAsm4K:
|
|
CMPL DI, $0x01
|
|
JE matchlen_match1_match_nolit2_encodeBlockAsm4K
|
|
JB match_nolit2_end_encodeBlockAsm4K
|
|
MOVW (R8)(R11*1), R9
|
|
CMPW (SI)(R11*1), R9
|
|
JNE matchlen_match1_match_nolit2_encodeBlockAsm4K
|
|
LEAL 2(R11), R11
|
|
SUBL $0x02, DI
|
|
JZ match_nolit2_end_encodeBlockAsm4K
|
|
|
|
matchlen_match1_match_nolit2_encodeBlockAsm4K:
|
|
MOVB (R8)(R11*1), R9
|
|
CMPB (SI)(R11*1), R9
|
|
JNE match_nolit2_end_encodeBlockAsm4K
|
|
LEAL 1(R11), R11
|
|
|
|
match_nolit2_end_encodeBlockAsm4K:
|
|
ADDL R11, DX
|
|
ADDL $0x04, R11
|
|
MOVL DX, 12(SP)
|
|
MOVL 16(SP), SI
|
|
JMP match_nolits_copy_encodeBlockAsm4K
|
|
|
|
emit_remainder_encodeBlockAsm4K:
|
|
MOVQ src_len+32(FP), AX
|
|
MOVL 12(SP), DX
|
|
SUBL DX, AX
|
|
JZ emit_remainder_end_encodeBlockAsm4K
|
|
LEAQ (BX)(DX*1), DX
|
|
LEAQ 3(CX)(AX*1), BX
|
|
CMPQ BX, (SP)
|
|
JB dst_size_check_ok_5
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
dst_size_check_ok_5:
|
|
// emitLiteral
|
|
LEAL -1(AX), BX
|
|
CMPL BX, $0x1d
|
|
JB one_byte_emit_remainder_encodeBlockAsm4K
|
|
SUBL $0x1d, BX
|
|
CMPL BX, $0x00000100
|
|
JB two_bytes_emit_remainder_encodeBlockAsm4K
|
|
JB three_bytes_emit_remainder_encodeBlockAsm4K
|
|
|
|
three_bytes_emit_remainder_encodeBlockAsm4K:
|
|
MOVB $0xf0, (CX)
|
|
MOVW BX, 1(CX)
|
|
ADDQ $0x03, CX
|
|
ADDL $0x1d, BX
|
|
JMP memmove_long_emit_remainder_encodeBlockAsm4K
|
|
|
|
two_bytes_emit_remainder_encodeBlockAsm4K:
|
|
MOVB $0xe8, (CX)
|
|
MOVB BL, 1(CX)
|
|
ADDL $0x1d, BX
|
|
ADDQ $0x02, CX
|
|
CMPL BX, $0x40
|
|
JB memmove_midemit_remainder_encodeBlockAsm4K
|
|
JMP memmove_long_emit_remainder_encodeBlockAsm4K
|
|
|
|
one_byte_emit_remainder_encodeBlockAsm4K:
|
|
SHLB $0x03, BL
|
|
MOVB BL, (CX)
|
|
ADDQ $0x01, CX
|
|
LEAQ (CX)(AX*1), BX
|
|
|
|
// genMemMoveShort
|
|
// margin: 0, min move: 1
|
|
CMPQ AX, $0x03
|
|
JB emit_lit_memmove_emit_remainder_encodeBlockAsm4K_memmove_move_1or2
|
|
JE emit_lit_memmove_emit_remainder_encodeBlockAsm4K_memmove_move_3
|
|
CMPQ AX, $0x08
|
|
JBE emit_lit_memmove_emit_remainder_encodeBlockAsm4K_memmove_move_4through8
|
|
CMPQ AX, $0x10
|
|
JBE emit_lit_memmove_emit_remainder_encodeBlockAsm4K_memmove_move_8through16
|
|
CMPQ AX, $0x20
|
|
JBE emit_lit_memmove_emit_remainder_encodeBlockAsm4K_memmove_move_17through32
|
|
JMP emit_lit_memmove_emit_remainder_encodeBlockAsm4K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBlockAsm4K_memmove_move_1or2:
|
|
MOVB (DX), SI
|
|
MOVB -1(DX)(AX*1), DL
|
|
MOVB SI, (CX)
|
|
MOVB DL, -1(CX)(AX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBlockAsm4K
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBlockAsm4K_memmove_move_3:
|
|
MOVW (DX), SI
|
|
MOVB 2(DX), DL
|
|
MOVW SI, (CX)
|
|
MOVB DL, 2(CX)
|
|
JMP memmove_end_copy_emit_remainder_encodeBlockAsm4K
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBlockAsm4K_memmove_move_4through8:
|
|
MOVL (DX), SI
|
|
MOVL -4(DX)(AX*1), DX
|
|
MOVL SI, (CX)
|
|
MOVL DX, -4(CX)(AX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBlockAsm4K
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBlockAsm4K_memmove_move_8through16:
|
|
MOVQ (DX), SI
|
|
MOVQ -8(DX)(AX*1), DX
|
|
MOVQ SI, (CX)
|
|
MOVQ DX, -8(CX)(AX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBlockAsm4K
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBlockAsm4K_memmove_move_17through32:
|
|
MOVOU (DX), X0
|
|
MOVOU -16(DX)(AX*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(AX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBlockAsm4K
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBlockAsm4K_memmove_move_33through64:
|
|
MOVOU (DX), X0
|
|
MOVOU 16(DX), X1
|
|
MOVOU -32(DX)(AX*1), X2
|
|
MOVOU -16(DX)(AX*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(AX*1)
|
|
MOVOU X3, -16(CX)(AX*1)
|
|
|
|
memmove_end_copy_emit_remainder_encodeBlockAsm4K:
|
|
MOVQ BX, CX
|
|
JMP emit_remainder_end_encodeBlockAsm4K
|
|
|
|
memmove_midemit_remainder_encodeBlockAsm4K:
|
|
LEAQ (CX)(AX*1), BX
|
|
|
|
// genMemMoveShort
|
|
// margin: 0, min move: 30
|
|
CMPQ AX, $0x20
|
|
JBE emit_lit_memmove_mid_emit_remainder_encodeBlockAsm4K_memmove_move_17through32
|
|
JMP emit_lit_memmove_mid_emit_remainder_encodeBlockAsm4K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_mid_emit_remainder_encodeBlockAsm4K_memmove_move_17through32:
|
|
MOVOU (DX), X0
|
|
MOVOU -16(DX)(AX*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(AX*1)
|
|
JMP memmove_mid_end_copy_emit_remainder_encodeBlockAsm4K
|
|
|
|
emit_lit_memmove_mid_emit_remainder_encodeBlockAsm4K_memmove_move_33through64:
|
|
MOVOU (DX), X0
|
|
MOVOU 16(DX), X1
|
|
MOVOU -32(DX)(AX*1), X2
|
|
MOVOU -16(DX)(AX*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(AX*1)
|
|
MOVOU X3, -16(CX)(AX*1)
|
|
|
|
memmove_mid_end_copy_emit_remainder_encodeBlockAsm4K:
|
|
MOVQ BX, CX
|
|
JMP emit_remainder_end_encodeBlockAsm4K
|
|
|
|
memmove_long_emit_remainder_encodeBlockAsm4K:
|
|
LEAQ (CX)(AX*1), BX
|
|
|
|
// genMemMoveLong
|
|
MOVOU (DX), X0
|
|
MOVOU 16(DX), X1
|
|
MOVOU -32(DX)(AX*1), X2
|
|
MOVOU -16(DX)(AX*1), X3
|
|
MOVQ AX, DI
|
|
SHRQ $0x05, DI
|
|
MOVQ CX, SI
|
|
ANDL $0x0000001f, SI
|
|
MOVQ $0x00000040, R8
|
|
SUBQ SI, R8
|
|
DECQ DI
|
|
JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm4Klarge_forward_sse_loop_32
|
|
LEAQ -32(DX)(R8*1), SI
|
|
LEAQ -32(CX)(R8*1), R9
|
|
|
|
emit_lit_memmove_long_emit_remainder_encodeBlockAsm4Klarge_big_loop_back:
|
|
MOVOU (SI), X4
|
|
MOVOU 16(SI), X5
|
|
MOVOA X4, (R9)
|
|
MOVOA X5, 16(R9)
|
|
ADDQ $0x20, R9
|
|
ADDQ $0x20, SI
|
|
ADDQ $0x20, R8
|
|
DECQ DI
|
|
JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm4Klarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_emit_remainder_encodeBlockAsm4Klarge_forward_sse_loop_32:
|
|
MOVOU -32(DX)(R8*1), X4
|
|
MOVOU -16(DX)(R8*1), X5
|
|
MOVOA X4, -32(CX)(R8*1)
|
|
MOVOA X5, -16(CX)(R8*1)
|
|
ADDQ $0x20, R8
|
|
CMPQ AX, R8
|
|
JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm4Klarge_forward_sse_loop_32
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(AX*1)
|
|
MOVOU X3, -16(CX)(AX*1)
|
|
MOVQ BX, CX
|
|
|
|
emit_remainder_end_encodeBlockAsm4K:
|
|
MOVQ dst_base+0(FP), AX
|
|
SUBQ AX, CX
|
|
MOVQ CX, ret+56(FP)
|
|
RET
|
|
|
|
// func encodeBlockAsm1K(dst []byte, src []byte, tmp *[1024]byte) int
|
|
// Requires: BMI, CMOV, SSE2
|
|
TEXT ·encodeBlockAsm1K(SB), $24-64
|
|
MOVQ tmp+48(FP), AX
|
|
MOVQ dst_base+0(FP), CX
|
|
MOVQ $0x00000008, DX
|
|
MOVQ AX, BX
|
|
PXOR X0, X0
|
|
|
|
zero_loop_encodeBlockAsm1K:
|
|
MOVOU X0, (BX)
|
|
MOVOU X0, 16(BX)
|
|
MOVOU X0, 32(BX)
|
|
MOVOU X0, 48(BX)
|
|
MOVOU X0, 64(BX)
|
|
MOVOU X0, 80(BX)
|
|
MOVOU X0, 96(BX)
|
|
MOVOU X0, 112(BX)
|
|
ADDQ $0x80, BX
|
|
DECQ DX
|
|
JNZ zero_loop_encodeBlockAsm1K
|
|
MOVL $0x00000000, 12(SP)
|
|
MOVQ src_len+32(FP), DX
|
|
LEAQ -17(DX), BX
|
|
LEAQ -17(DX), SI
|
|
MOVL SI, 8(SP)
|
|
SHRQ $0x05, DX
|
|
SUBL DX, BX
|
|
LEAQ (CX)(BX*1), BX
|
|
MOVQ BX, (SP)
|
|
MOVL $0x00000001, DX
|
|
MOVL DX, 16(SP)
|
|
MOVQ src_base+24(FP), BX
|
|
|
|
search_loop_encodeBlockAsm1K:
|
|
MOVL DX, SI
|
|
SUBL 12(SP), SI
|
|
SHRL $0x04, SI
|
|
LEAL 4(DX)(SI*1), SI
|
|
CMPL SI, 8(SP)
|
|
JAE emit_remainder_encodeBlockAsm1K
|
|
MOVQ (BX)(DX*1), DI
|
|
MOVL SI, 20(SP)
|
|
MOVQ $0x9e3779b1, R9
|
|
MOVQ DI, R10
|
|
MOVQ DI, R11
|
|
SHRQ $0x08, R11
|
|
SHLQ $0x20, R10
|
|
IMULQ R9, R10
|
|
SHRQ $0x37, R10
|
|
SHLQ $0x20, R11
|
|
IMULQ R9, R11
|
|
SHRQ $0x37, R11
|
|
MOVWLZX (AX)(R10*2), SI
|
|
MOVWLZX (AX)(R11*2), R8
|
|
MOVW DX, (AX)(R10*2)
|
|
MOVW DX, (AX)(R11*2)
|
|
MOVQ DI, R10
|
|
SHRQ $0x10, R10
|
|
SHLQ $0x20, R10
|
|
IMULQ R9, R10
|
|
SHRQ $0x37, R10
|
|
MOVL DX, R9
|
|
SUBL 16(SP), R9
|
|
MOVL 1(BX)(R9*1), R11
|
|
MOVQ DI, R9
|
|
SHRQ $0x08, R9
|
|
CMPL R9, R11
|
|
JNE no_repeat_found_encodeBlockAsm1K
|
|
LEAL 1(DX), DI
|
|
MOVL 12(SP), SI
|
|
MOVL DI, R8
|
|
SUBL 16(SP), R8
|
|
JZ repeat_extend_back_end_encodeBlockAsm1K
|
|
|
|
repeat_extend_back_loop_encodeBlockAsm1K:
|
|
CMPL DI, SI
|
|
JBE repeat_extend_back_end_encodeBlockAsm1K
|
|
MOVB -1(BX)(R8*1), R9
|
|
MOVB -1(BX)(DI*1), R10
|
|
CMPB R9, R10
|
|
JNE repeat_extend_back_end_encodeBlockAsm1K
|
|
LEAL -1(DI), DI
|
|
DECL R8
|
|
JNZ repeat_extend_back_loop_encodeBlockAsm1K
|
|
|
|
repeat_extend_back_end_encodeBlockAsm1K:
|
|
MOVL DI, SI
|
|
MOVL 12(SP), R8
|
|
SUBL R8, SI
|
|
LEAQ 3(CX)(SI*1), R9
|
|
CMPQ R9, (SP)
|
|
JB dst_size_check_ok_1
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
dst_size_check_ok_1:
|
|
LEAQ (BX)(R8*1), R8
|
|
|
|
// emitLiteral
|
|
LEAL -1(SI), R9
|
|
CMPL R9, $0x1d
|
|
JB one_byte_repeat_emit_lits_encodeBlockAsm1K
|
|
SUBL $0x1d, R9
|
|
CMPL R9, $0x00000100
|
|
JB two_bytes_repeat_emit_lits_encodeBlockAsm1K
|
|
JB three_bytes_repeat_emit_lits_encodeBlockAsm1K
|
|
|
|
three_bytes_repeat_emit_lits_encodeBlockAsm1K:
|
|
MOVB $0xf0, (CX)
|
|
MOVW R9, 1(CX)
|
|
ADDQ $0x03, CX
|
|
ADDL $0x1d, R9
|
|
JMP memmove_long_repeat_emit_lits_encodeBlockAsm1K
|
|
|
|
two_bytes_repeat_emit_lits_encodeBlockAsm1K:
|
|
MOVB $0xe8, (CX)
|
|
MOVB R9, 1(CX)
|
|
ADDL $0x1d, R9
|
|
ADDQ $0x02, CX
|
|
CMPL R9, $0x40
|
|
JB memmove_midrepeat_emit_lits_encodeBlockAsm1K
|
|
JMP memmove_long_repeat_emit_lits_encodeBlockAsm1K
|
|
|
|
one_byte_repeat_emit_lits_encodeBlockAsm1K:
|
|
SHLB $0x03, R9
|
|
MOVB R9, (CX)
|
|
ADDQ $0x01, CX
|
|
LEAQ (CX)(SI*1), R9
|
|
|
|
// genMemMoveShort
|
|
// margin: 16, min move: 1
|
|
CMPQ SI, $0x10
|
|
JBE emit_lit_memmove_repeat_emit_lits_encodeBlockAsm1K_memmove_move_8through16
|
|
CMPQ SI, $0x20
|
|
JBE emit_lit_memmove_repeat_emit_lits_encodeBlockAsm1K_memmove_move_17through32
|
|
JMP emit_lit_memmove_repeat_emit_lits_encodeBlockAsm1K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_repeat_emit_lits_encodeBlockAsm1K_memmove_move_8through16:
|
|
MOVOU (R8), X0
|
|
MOVOU X0, (CX)
|
|
JMP memmove_end_copy_repeat_emit_lits_encodeBlockAsm1K
|
|
|
|
emit_lit_memmove_repeat_emit_lits_encodeBlockAsm1K_memmove_move_17through32:
|
|
MOVOU (R8), X0
|
|
MOVOU -16(R8)(SI*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(SI*1)
|
|
JMP memmove_end_copy_repeat_emit_lits_encodeBlockAsm1K
|
|
|
|
emit_lit_memmove_repeat_emit_lits_encodeBlockAsm1K_memmove_move_33through64:
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(SI*1), X2
|
|
MOVOU -16(R8)(SI*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(SI*1)
|
|
MOVOU X3, -16(CX)(SI*1)
|
|
|
|
memmove_end_copy_repeat_emit_lits_encodeBlockAsm1K:
|
|
MOVQ R9, CX
|
|
JMP repeat_emit_lits_end_encodeBlockAsm1K
|
|
|
|
memmove_midrepeat_emit_lits_encodeBlockAsm1K:
|
|
LEAQ (CX)(SI*1), R9
|
|
|
|
// genMemMoveShort
|
|
// margin: 15, min move: 30
|
|
CMPQ SI, $0x20
|
|
JBE emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm1K_memmove_move_17through32
|
|
JMP emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm1K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm1K_memmove_move_17through32:
|
|
MOVOU (R8), X0
|
|
MOVOU -16(R8)(SI*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(SI*1)
|
|
JMP memmove_mid_end_copy_repeat_emit_lits_encodeBlockAsm1K
|
|
|
|
emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm1K_memmove_move_33through64:
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(SI*1), X2
|
|
MOVOU -16(R8)(SI*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(SI*1)
|
|
MOVOU X3, -16(CX)(SI*1)
|
|
|
|
memmove_mid_end_copy_repeat_emit_lits_encodeBlockAsm1K:
|
|
MOVQ R9, CX
|
|
JMP repeat_emit_lits_end_encodeBlockAsm1K
|
|
|
|
memmove_long_repeat_emit_lits_encodeBlockAsm1K:
|
|
LEAQ (CX)(SI*1), R9
|
|
|
|
// genMemMoveLong
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(SI*1), X2
|
|
MOVOU -16(R8)(SI*1), X3
|
|
MOVQ SI, R11
|
|
SHRQ $0x05, R11
|
|
MOVQ CX, R10
|
|
ANDL $0x0000001f, R10
|
|
MOVQ $0x00000040, R12
|
|
SUBQ R10, R12
|
|
DECQ R11
|
|
JA emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm1Klarge_forward_sse_loop_32
|
|
LEAQ -32(R8)(R12*1), R10
|
|
LEAQ -32(CX)(R12*1), R13
|
|
|
|
emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm1Klarge_big_loop_back:
|
|
MOVOU (R10), X4
|
|
MOVOU 16(R10), X5
|
|
MOVOA X4, (R13)
|
|
MOVOA X5, 16(R13)
|
|
ADDQ $0x20, R13
|
|
ADDQ $0x20, R10
|
|
ADDQ $0x20, R12
|
|
DECQ R11
|
|
JNA emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm1Klarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm1Klarge_forward_sse_loop_32:
|
|
MOVOU -32(R8)(R12*1), X4
|
|
MOVOU -16(R8)(R12*1), X5
|
|
MOVOA X4, -32(CX)(R12*1)
|
|
MOVOA X5, -16(CX)(R12*1)
|
|
ADDQ $0x20, R12
|
|
CMPQ SI, R12
|
|
JAE emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm1Klarge_forward_sse_loop_32
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(SI*1)
|
|
MOVOU X3, -16(CX)(SI*1)
|
|
MOVQ R9, CX
|
|
|
|
repeat_emit_lits_end_encodeBlockAsm1K:
|
|
ADDL $0x05, DX
|
|
MOVL DX, SI
|
|
SUBL 16(SP), SI
|
|
MOVQ src_len+32(FP), R8
|
|
SUBL DX, R8
|
|
LEAQ (BX)(DX*1), R9
|
|
LEAQ (BX)(SI*1), SI
|
|
|
|
// matchLen
|
|
XORL R11, R11
|
|
JMP matchlen_loop_16_entry_repeat_extend_encodeBlockAsm1K
|
|
|
|
matchlen_loopback_16_repeat_extend_encodeBlockAsm1K:
|
|
MOVQ (R9)(R11*1), R10
|
|
MOVQ 8(R9)(R11*1), R12
|
|
XORQ (SI)(R11*1), R10
|
|
JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm1K
|
|
XORQ 8(SI)(R11*1), R12
|
|
JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm1K
|
|
LEAL -16(R8), R8
|
|
LEAL 16(R11), R11
|
|
|
|
matchlen_loop_16_entry_repeat_extend_encodeBlockAsm1K:
|
|
CMPL R8, $0x10
|
|
JAE matchlen_loopback_16_repeat_extend_encodeBlockAsm1K
|
|
JMP matchlen_match8_repeat_extend_encodeBlockAsm1K
|
|
|
|
matchlen_bsf_16repeat_extend_encodeBlockAsm1K:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R12, R12
|
|
|
|
#else
|
|
BSFQ R12, R12
|
|
|
|
#endif
|
|
SARQ $0x03, R12
|
|
LEAL 8(R11)(R12*1), R11
|
|
JMP repeat_extend_forward_end_encodeBlockAsm1K
|
|
|
|
matchlen_match8_repeat_extend_encodeBlockAsm1K:
|
|
CMPL R8, $0x08
|
|
JB matchlen_match4_repeat_extend_encodeBlockAsm1K
|
|
MOVQ (R9)(R11*1), R10
|
|
XORQ (SI)(R11*1), R10
|
|
JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm1K
|
|
LEAL -8(R8), R8
|
|
LEAL 8(R11), R11
|
|
JMP matchlen_match4_repeat_extend_encodeBlockAsm1K
|
|
|
|
matchlen_bsf_8_repeat_extend_encodeBlockAsm1K:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R10, R10
|
|
|
|
#else
|
|
BSFQ R10, R10
|
|
|
|
#endif
|
|
SARQ $0x03, R10
|
|
LEAL (R11)(R10*1), R11
|
|
JMP repeat_extend_forward_end_encodeBlockAsm1K
|
|
|
|
matchlen_match4_repeat_extend_encodeBlockAsm1K:
|
|
CMPL R8, $0x04
|
|
JB matchlen_match2_repeat_extend_encodeBlockAsm1K
|
|
MOVL (R9)(R11*1), R10
|
|
CMPL (SI)(R11*1), R10
|
|
JNE matchlen_match2_repeat_extend_encodeBlockAsm1K
|
|
LEAL -4(R8), R8
|
|
LEAL 4(R11), R11
|
|
|
|
matchlen_match2_repeat_extend_encodeBlockAsm1K:
|
|
CMPL R8, $0x01
|
|
JE matchlen_match1_repeat_extend_encodeBlockAsm1K
|
|
JB repeat_extend_forward_end_encodeBlockAsm1K
|
|
MOVW (R9)(R11*1), R10
|
|
CMPW (SI)(R11*1), R10
|
|
JNE matchlen_match1_repeat_extend_encodeBlockAsm1K
|
|
LEAL 2(R11), R11
|
|
SUBL $0x02, R8
|
|
JZ repeat_extend_forward_end_encodeBlockAsm1K
|
|
|
|
matchlen_match1_repeat_extend_encodeBlockAsm1K:
|
|
MOVB (R9)(R11*1), R10
|
|
CMPB (SI)(R11*1), R10
|
|
JNE repeat_extend_forward_end_encodeBlockAsm1K
|
|
LEAL 1(R11), R11
|
|
|
|
repeat_extend_forward_end_encodeBlockAsm1K:
|
|
ADDL R11, DX
|
|
MOVL DX, SI
|
|
SUBL DI, SI
|
|
MOVL 16(SP), DI
|
|
|
|
// emitRepeat
|
|
LEAL -1(SI), DI
|
|
CMPL SI, $0x1d
|
|
JBE repeat_one_match_repeat_encodeBlockAsm1K
|
|
LEAL -30(SI), DI
|
|
CMPL SI, $0x0000011e
|
|
JB repeat_two_match_repeat_encodeBlockAsm1K
|
|
CMPL SI, $0x0001001e
|
|
JB repeat_three_match_repeat_encodeBlockAsm1K
|
|
MOVB $0xfc, (CX)
|
|
MOVL DI, 1(CX)
|
|
ADDQ $0x04, CX
|
|
JMP repeat_end_emit_encodeBlockAsm1K
|
|
|
|
repeat_three_match_repeat_encodeBlockAsm1K:
|
|
MOVB $0xf4, (CX)
|
|
MOVW DI, 1(CX)
|
|
ADDQ $0x03, CX
|
|
JMP repeat_end_emit_encodeBlockAsm1K
|
|
|
|
repeat_two_match_repeat_encodeBlockAsm1K:
|
|
MOVB $0xec, (CX)
|
|
MOVB DI, 1(CX)
|
|
ADDQ $0x02, CX
|
|
JMP repeat_end_emit_encodeBlockAsm1K
|
|
|
|
repeat_one_match_repeat_encodeBlockAsm1K:
|
|
XORL DI, DI
|
|
LEAL -4(DI)(SI*8), DI
|
|
MOVB DI, (CX)
|
|
ADDQ $0x01, CX
|
|
|
|
repeat_end_emit_encodeBlockAsm1K:
|
|
MOVL DX, 12(SP)
|
|
JMP search_loop_encodeBlockAsm1K
|
|
|
|
no_repeat_found_encodeBlockAsm1K:
|
|
CMPL (BX)(SI*1), DI
|
|
JEQ candidate_match_encodeBlockAsm1K
|
|
SHRQ $0x08, DI
|
|
MOVWLZX (AX)(R10*2), SI
|
|
LEAL 2(DX), R9
|
|
CMPL (BX)(R8*1), DI
|
|
JEQ candidate2_match_encodeBlockAsm1K
|
|
MOVW R9, (AX)(R10*2)
|
|
SHRQ $0x08, DI
|
|
CMPL (BX)(SI*1), DI
|
|
JEQ candidate3_match_encodeBlockAsm1K
|
|
MOVL 20(SP), DX
|
|
JMP search_loop_encodeBlockAsm1K
|
|
|
|
candidate3_match_encodeBlockAsm1K:
|
|
ADDL $0x02, DX
|
|
JMP candidate_match_encodeBlockAsm1K
|
|
|
|
candidate2_match_encodeBlockAsm1K:
|
|
MOVW R9, (AX)(R10*2)
|
|
INCL DX
|
|
MOVL R8, SI
|
|
|
|
candidate_match_encodeBlockAsm1K:
|
|
MOVL 12(SP), DI
|
|
TESTL SI, SI
|
|
JZ match_extend_back_end_encodeBlockAsm1K
|
|
|
|
match_extend_back_loop_encodeBlockAsm1K:
|
|
CMPL DX, DI
|
|
JBE match_extend_back_end_encodeBlockAsm1K
|
|
MOVB -1(BX)(SI*1), R8
|
|
MOVB -1(BX)(DX*1), R9
|
|
CMPB R8, R9
|
|
JNE match_extend_back_end_encodeBlockAsm1K
|
|
LEAL -1(DX), DX
|
|
DECL SI
|
|
JZ match_extend_back_end_encodeBlockAsm1K
|
|
JMP match_extend_back_loop_encodeBlockAsm1K
|
|
|
|
match_extend_back_end_encodeBlockAsm1K:
|
|
CMPQ CX, (SP)
|
|
JB dst_size_check_ok_2
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
dst_size_check_ok_2:
|
|
MOVL DX, R8
|
|
MOVL DX, DI
|
|
SUBL SI, DI
|
|
MOVL DI, 16(SP)
|
|
ADDL $0x04, DX
|
|
ADDL $0x04, SI
|
|
MOVQ src_len+32(FP), DI
|
|
SUBL DX, DI
|
|
LEAQ (BX)(DX*1), R9
|
|
LEAQ (BX)(SI*1), SI
|
|
|
|
// matchLen
|
|
XORL R11, R11
|
|
JMP matchlen_loop_16_entry_match_nolit_encodeBlockAsm1K
|
|
|
|
matchlen_loopback_16_match_nolit_encodeBlockAsm1K:
|
|
MOVQ (R9)(R11*1), R10
|
|
MOVQ 8(R9)(R11*1), R12
|
|
XORQ (SI)(R11*1), R10
|
|
JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm1K
|
|
XORQ 8(SI)(R11*1), R12
|
|
JNZ matchlen_bsf_16match_nolit_encodeBlockAsm1K
|
|
LEAL -16(DI), DI
|
|
LEAL 16(R11), R11
|
|
|
|
matchlen_loop_16_entry_match_nolit_encodeBlockAsm1K:
|
|
CMPL DI, $0x10
|
|
JAE matchlen_loopback_16_match_nolit_encodeBlockAsm1K
|
|
JMP matchlen_match8_match_nolit_encodeBlockAsm1K
|
|
|
|
matchlen_bsf_16match_nolit_encodeBlockAsm1K:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R12, R12
|
|
|
|
#else
|
|
BSFQ R12, R12
|
|
|
|
#endif
|
|
SARQ $0x03, R12
|
|
LEAL 8(R11)(R12*1), R11
|
|
JMP match_nolit_end_encodeBlockAsm1K
|
|
|
|
matchlen_match8_match_nolit_encodeBlockAsm1K:
|
|
CMPL DI, $0x08
|
|
JB matchlen_match4_match_nolit_encodeBlockAsm1K
|
|
MOVQ (R9)(R11*1), R10
|
|
XORQ (SI)(R11*1), R10
|
|
JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm1K
|
|
LEAL -8(DI), DI
|
|
LEAL 8(R11), R11
|
|
JMP matchlen_match4_match_nolit_encodeBlockAsm1K
|
|
|
|
matchlen_bsf_8_match_nolit_encodeBlockAsm1K:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R10, R10
|
|
|
|
#else
|
|
BSFQ R10, R10
|
|
|
|
#endif
|
|
SARQ $0x03, R10
|
|
LEAL (R11)(R10*1), R11
|
|
JMP match_nolit_end_encodeBlockAsm1K
|
|
|
|
matchlen_match4_match_nolit_encodeBlockAsm1K:
|
|
CMPL DI, $0x04
|
|
JB matchlen_match2_match_nolit_encodeBlockAsm1K
|
|
MOVL (R9)(R11*1), R10
|
|
CMPL (SI)(R11*1), R10
|
|
JNE matchlen_match2_match_nolit_encodeBlockAsm1K
|
|
LEAL -4(DI), DI
|
|
LEAL 4(R11), R11
|
|
|
|
matchlen_match2_match_nolit_encodeBlockAsm1K:
|
|
CMPL DI, $0x01
|
|
JE matchlen_match1_match_nolit_encodeBlockAsm1K
|
|
JB match_nolit_end_encodeBlockAsm1K
|
|
MOVW (R9)(R11*1), R10
|
|
CMPW (SI)(R11*1), R10
|
|
JNE matchlen_match1_match_nolit_encodeBlockAsm1K
|
|
LEAL 2(R11), R11
|
|
SUBL $0x02, DI
|
|
JZ match_nolit_end_encodeBlockAsm1K
|
|
|
|
matchlen_match1_match_nolit_encodeBlockAsm1K:
|
|
MOVB (R9)(R11*1), R10
|
|
CMPB (SI)(R11*1), R10
|
|
JNE match_nolit_end_encodeBlockAsm1K
|
|
LEAL 1(R11), R11
|
|
|
|
match_nolit_end_encodeBlockAsm1K:
|
|
ADDL R11, DX
|
|
ADDL $0x04, R11
|
|
MOVL 16(SP), SI
|
|
MOVL 12(SP), DI
|
|
MOVL DX, 12(SP)
|
|
SUBL DI, R8
|
|
JZ match_nolits_copy_encodeBlockAsm1K
|
|
LEAQ (BX)(DI*1), DI
|
|
CMPL R8, $0x03
|
|
JA match_emit_lits_copy_encodeBlockAsm1K
|
|
CMPL SI, $0x40
|
|
JB match_emit_lits_copy_encodeBlockAsm1K
|
|
MOVL (DI), DI
|
|
|
|
// emitCopy2WithLits
|
|
XORQ R9, R9
|
|
SUBL $0x40, SI
|
|
LEAL -11(R11), R10
|
|
LEAL -4(R11), R11
|
|
MOVW SI, 1(CX)
|
|
CMPL R11, $0x07
|
|
CMOVLGE R10, R9
|
|
MOVQ $0x00000007, SI
|
|
CMOVLLT R11, SI
|
|
LEAL -1(R8)(SI*4), SI
|
|
MOVL $0x00000003, R10
|
|
LEAL (R10)(SI*8), SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x03, CX
|
|
MOVL DI, (CX)
|
|
ADDQ R8, CX
|
|
TESTL R9, R9
|
|
JZ match_nolit_emitcopy_end_encodeBlockAsm1K
|
|
|
|
// emitRepeat
|
|
LEAL -1(R9), SI
|
|
CMPL R9, $0x1d
|
|
JBE repeat_one_match_emit_repeat_copy2_encodeBlockAsm1K
|
|
LEAL -30(R9), SI
|
|
CMPL R9, $0x0000011e
|
|
JB repeat_two_match_emit_repeat_copy2_encodeBlockAsm1K
|
|
CMPL R9, $0x0001001e
|
|
JB repeat_three_match_emit_repeat_copy2_encodeBlockAsm1K
|
|
MOVB $0xfc, (CX)
|
|
MOVL SI, 1(CX)
|
|
ADDQ $0x04, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm1K
|
|
|
|
repeat_three_match_emit_repeat_copy2_encodeBlockAsm1K:
|
|
MOVB $0xf4, (CX)
|
|
MOVW SI, 1(CX)
|
|
ADDQ $0x03, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm1K
|
|
|
|
repeat_two_match_emit_repeat_copy2_encodeBlockAsm1K:
|
|
MOVB $0xec, (CX)
|
|
MOVB SI, 1(CX)
|
|
ADDQ $0x02, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm1K
|
|
|
|
repeat_one_match_emit_repeat_copy2_encodeBlockAsm1K:
|
|
XORL SI, SI
|
|
LEAL -4(SI)(R9*8), SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x01, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm1K
|
|
|
|
match_emit_lits_copy_encodeBlockAsm1K:
|
|
LEAQ 3(CX)(R8*1), R9
|
|
CMPQ R9, (SP)
|
|
JB dst_size_check_ok_3
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
dst_size_check_ok_3:
|
|
// emitLiteral
|
|
LEAL -1(R8), R9
|
|
CMPL R9, $0x1d
|
|
JB one_byte_match_emit_encodeBlockAsm1K
|
|
SUBL $0x1d, R9
|
|
CMPL R9, $0x00000100
|
|
JB two_bytes_match_emit_encodeBlockAsm1K
|
|
JB three_bytes_match_emit_encodeBlockAsm1K
|
|
|
|
three_bytes_match_emit_encodeBlockAsm1K:
|
|
MOVB $0xf0, (CX)
|
|
MOVW R9, 1(CX)
|
|
ADDQ $0x03, CX
|
|
ADDL $0x1d, R9
|
|
JMP memmove_long_match_emit_encodeBlockAsm1K
|
|
|
|
two_bytes_match_emit_encodeBlockAsm1K:
|
|
MOVB $0xe8, (CX)
|
|
MOVB R9, 1(CX)
|
|
ADDL $0x1d, R9
|
|
ADDQ $0x02, CX
|
|
CMPL R9, $0x40
|
|
JB memmove_midmatch_emit_encodeBlockAsm1K
|
|
JMP memmove_long_match_emit_encodeBlockAsm1K
|
|
|
|
one_byte_match_emit_encodeBlockAsm1K:
|
|
SHLB $0x03, R9
|
|
MOVB R9, (CX)
|
|
ADDQ $0x01, CX
|
|
LEAQ (CX)(R8*1), R9
|
|
|
|
// genMemMoveShort
|
|
// margin: 16, min move: 1
|
|
CMPQ R8, $0x10
|
|
JBE emit_lit_memmove_match_emit_encodeBlockAsm1K_memmove_move_8through16
|
|
CMPQ R8, $0x20
|
|
JBE emit_lit_memmove_match_emit_encodeBlockAsm1K_memmove_move_17through32
|
|
JMP emit_lit_memmove_match_emit_encodeBlockAsm1K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_match_emit_encodeBlockAsm1K_memmove_move_8through16:
|
|
MOVOU (DI), X0
|
|
MOVOU X0, (CX)
|
|
JMP memmove_end_copy_match_emit_encodeBlockAsm1K
|
|
|
|
emit_lit_memmove_match_emit_encodeBlockAsm1K_memmove_move_17through32:
|
|
MOVOU (DI), X0
|
|
MOVOU -16(DI)(R8*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(R8*1)
|
|
JMP memmove_end_copy_match_emit_encodeBlockAsm1K
|
|
|
|
emit_lit_memmove_match_emit_encodeBlockAsm1K_memmove_move_33through64:
|
|
MOVOU (DI), X0
|
|
MOVOU 16(DI), X1
|
|
MOVOU -32(DI)(R8*1), X2
|
|
MOVOU -16(DI)(R8*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(R8*1)
|
|
MOVOU X3, -16(CX)(R8*1)
|
|
|
|
memmove_end_copy_match_emit_encodeBlockAsm1K:
|
|
MOVQ R9, CX
|
|
JMP match_nolits_copy_encodeBlockAsm1K
|
|
|
|
memmove_midmatch_emit_encodeBlockAsm1K:
|
|
LEAQ (CX)(R8*1), R9
|
|
|
|
// genMemMoveShort
|
|
// margin: 15, min move: 30
|
|
CMPQ R8, $0x20
|
|
JBE emit_lit_memmove_mid_match_emit_encodeBlockAsm1K_memmove_move_17through32
|
|
JMP emit_lit_memmove_mid_match_emit_encodeBlockAsm1K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_mid_match_emit_encodeBlockAsm1K_memmove_move_17through32:
|
|
MOVOU (DI), X0
|
|
MOVOU -16(DI)(R8*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(R8*1)
|
|
JMP memmove_mid_end_copy_match_emit_encodeBlockAsm1K
|
|
|
|
emit_lit_memmove_mid_match_emit_encodeBlockAsm1K_memmove_move_33through64:
|
|
MOVOU (DI), X0
|
|
MOVOU 16(DI), X1
|
|
MOVOU -32(DI)(R8*1), X2
|
|
MOVOU -16(DI)(R8*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(R8*1)
|
|
MOVOU X3, -16(CX)(R8*1)
|
|
|
|
memmove_mid_end_copy_match_emit_encodeBlockAsm1K:
|
|
MOVQ R9, CX
|
|
JMP match_nolits_copy_encodeBlockAsm1K
|
|
|
|
memmove_long_match_emit_encodeBlockAsm1K:
|
|
LEAQ (CX)(R8*1), R9
|
|
|
|
// genMemMoveLong
|
|
MOVOU (DI), X0
|
|
MOVOU 16(DI), X1
|
|
MOVOU -32(DI)(R8*1), X2
|
|
MOVOU -16(DI)(R8*1), X3
|
|
MOVQ R8, R12
|
|
SHRQ $0x05, R12
|
|
MOVQ CX, R10
|
|
ANDL $0x0000001f, R10
|
|
MOVQ $0x00000040, R13
|
|
SUBQ R10, R13
|
|
DECQ R12
|
|
JA emit_lit_memmove_long_match_emit_encodeBlockAsm1Klarge_forward_sse_loop_32
|
|
LEAQ -32(DI)(R13*1), R10
|
|
LEAQ -32(CX)(R13*1), R14
|
|
|
|
emit_lit_memmove_long_match_emit_encodeBlockAsm1Klarge_big_loop_back:
|
|
MOVOU (R10), X4
|
|
MOVOU 16(R10), X5
|
|
MOVOA X4, (R14)
|
|
MOVOA X5, 16(R14)
|
|
ADDQ $0x20, R14
|
|
ADDQ $0x20, R10
|
|
ADDQ $0x20, R13
|
|
DECQ R12
|
|
JNA emit_lit_memmove_long_match_emit_encodeBlockAsm1Klarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_match_emit_encodeBlockAsm1Klarge_forward_sse_loop_32:
|
|
MOVOU -32(DI)(R13*1), X4
|
|
MOVOU -16(DI)(R13*1), X5
|
|
MOVOA X4, -32(CX)(R13*1)
|
|
MOVOA X5, -16(CX)(R13*1)
|
|
ADDQ $0x20, R13
|
|
CMPQ R8, R13
|
|
JAE emit_lit_memmove_long_match_emit_encodeBlockAsm1Klarge_forward_sse_loop_32
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(R8*1)
|
|
MOVOU X3, -16(CX)(R8*1)
|
|
MOVQ R9, CX
|
|
|
|
match_nolits_copy_encodeBlockAsm1K:
|
|
// emitCopy
|
|
CMPL SI, $0x00000400
|
|
JA two_byte_match_nolit_encodeBlockAsm1K
|
|
CMPL R11, $0x00000013
|
|
JAE emit_one_longer_match_nolit_encodeBlockAsm1K
|
|
LEAL -1(SI), SI
|
|
SHLL $0x06, SI
|
|
LEAL -15(SI)(R11*4), SI
|
|
MOVW SI, (CX)
|
|
ADDQ $0x02, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm1K
|
|
|
|
emit_one_longer_match_nolit_encodeBlockAsm1K:
|
|
CMPL R11, $0x00000112
|
|
JAE emit_copy1_repeat_match_nolit_encodeBlockAsm1K
|
|
LEAL -1(SI), SI
|
|
SHLL $0x06, SI
|
|
LEAL 61(SI), SI
|
|
MOVW SI, (CX)
|
|
LEAL -18(R11), SI
|
|
MOVB SI, 2(CX)
|
|
ADDQ $0x03, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm1K
|
|
|
|
emit_copy1_repeat_match_nolit_encodeBlockAsm1K:
|
|
LEAL -1(SI), SI
|
|
SHLL $0x06, SI
|
|
LEAL 57(SI), SI
|
|
MOVW SI, (CX)
|
|
ADDQ $0x02, CX
|
|
SUBL $0x12, R11
|
|
|
|
// emitRepeat
|
|
LEAL -1(R11), SI
|
|
CMPL R11, $0x1d
|
|
JBE repeat_one_emit_copy1_do_repeat_match_nolit_encodeBlockAsm1K
|
|
LEAL -30(R11), SI
|
|
CMPL R11, $0x0000011e
|
|
JB repeat_two_emit_copy1_do_repeat_match_nolit_encodeBlockAsm1K
|
|
CMPL R11, $0x0001001e
|
|
JB repeat_three_emit_copy1_do_repeat_match_nolit_encodeBlockAsm1K
|
|
MOVB $0xfc, (CX)
|
|
MOVL SI, 1(CX)
|
|
ADDQ $0x04, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm1K
|
|
|
|
repeat_three_emit_copy1_do_repeat_match_nolit_encodeBlockAsm1K:
|
|
MOVB $0xf4, (CX)
|
|
MOVW SI, 1(CX)
|
|
ADDQ $0x03, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm1K
|
|
|
|
repeat_two_emit_copy1_do_repeat_match_nolit_encodeBlockAsm1K:
|
|
MOVB $0xec, (CX)
|
|
MOVB SI, 1(CX)
|
|
ADDQ $0x02, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm1K
|
|
|
|
repeat_one_emit_copy1_do_repeat_match_nolit_encodeBlockAsm1K:
|
|
XORL SI, SI
|
|
LEAL -4(SI)(R11*8), SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x01, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm1K
|
|
|
|
two_byte_match_nolit_encodeBlockAsm1K:
|
|
// emitCopy2
|
|
LEAL -64(SI), SI
|
|
LEAL -4(R11), R11
|
|
MOVW SI, 1(CX)
|
|
CMPL R11, $0x3c
|
|
JBE emit_copy2_0_match_nolit_encodeBlockAsm1K_emit2
|
|
LEAL -60(R11), SI
|
|
CMPL R11, $0x0000013c
|
|
JB emit_copy2_1_match_nolit_encodeBlockAsm1K_emit2
|
|
CMPL R11, $0x0001003c
|
|
JB emit_copy2_2_match_nolit_encodeBlockAsm1K_emit2
|
|
MOVB $0xfe, (CX)
|
|
MOVL SI, 3(CX)
|
|
ADDQ $0x06, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm1K
|
|
|
|
emit_copy2_2_match_nolit_encodeBlockAsm1K_emit2:
|
|
MOVB $0xfa, (CX)
|
|
MOVW SI, 3(CX)
|
|
ADDQ $0x05, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm1K
|
|
|
|
emit_copy2_1_match_nolit_encodeBlockAsm1K_emit2:
|
|
MOVB $0xf6, (CX)
|
|
MOVB SI, 3(CX)
|
|
ADDQ $0x04, CX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm1K
|
|
|
|
emit_copy2_0_match_nolit_encodeBlockAsm1K_emit2:
|
|
MOVL $0x00000002, SI
|
|
LEAL (SI)(R11*4), SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x03, CX
|
|
|
|
match_nolit_emitcopy_end_encodeBlockAsm1K:
|
|
CMPL DX, 8(SP)
|
|
JAE emit_remainder_encodeBlockAsm1K
|
|
MOVQ -2(BX)(DX*1), DI
|
|
CMPQ CX, (SP)
|
|
JB match_nolit_dst_ok_encodeBlockAsm1K
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
match_nolit_dst_ok_encodeBlockAsm1K:
|
|
MOVQ $0x9e3779b1, SI
|
|
MOVQ DI, R8
|
|
SHRQ $0x10, DI
|
|
MOVQ DI, R9
|
|
SHLQ $0x20, R8
|
|
IMULQ SI, R8
|
|
SHRQ $0x37, R8
|
|
SHLQ $0x20, R9
|
|
IMULQ SI, R9
|
|
SHRQ $0x37, R9
|
|
LEAL -2(DX), R10
|
|
MOVWLZX (AX)(R9*2), SI
|
|
MOVW R10, (AX)(R8*2)
|
|
MOVW DX, (AX)(R9*2)
|
|
MOVL DX, R8
|
|
INCL DX
|
|
CMPL (BX)(SI*1), DI
|
|
JNE search_loop_encodeBlockAsm1K
|
|
MOVL R8, DI
|
|
SUBL SI, DI
|
|
MOVL DI, 16(SP)
|
|
CMPQ CX, (SP)
|
|
JB dst_size_check_ok_4
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
dst_size_check_ok_4:
|
|
ADDL $0x03, DX
|
|
ADDL $0x04, SI
|
|
MOVQ src_len+32(FP), DI
|
|
SUBL DX, DI
|
|
LEAQ (BX)(DX*1), R8
|
|
LEAQ (BX)(SI*1), SI
|
|
|
|
// matchLen
|
|
XORL R11, R11
|
|
JMP matchlen_loop_16_entry_match_nolit2_encodeBlockAsm1K
|
|
|
|
matchlen_loopback_16_match_nolit2_encodeBlockAsm1K:
|
|
MOVQ (R8)(R11*1), R9
|
|
MOVQ 8(R8)(R11*1), R10
|
|
XORQ (SI)(R11*1), R9
|
|
JNZ matchlen_bsf_8_match_nolit2_encodeBlockAsm1K
|
|
XORQ 8(SI)(R11*1), R10
|
|
JNZ matchlen_bsf_16match_nolit2_encodeBlockAsm1K
|
|
LEAL -16(DI), DI
|
|
LEAL 16(R11), R11
|
|
|
|
matchlen_loop_16_entry_match_nolit2_encodeBlockAsm1K:
|
|
CMPL DI, $0x10
|
|
JAE matchlen_loopback_16_match_nolit2_encodeBlockAsm1K
|
|
JMP matchlen_match8_match_nolit2_encodeBlockAsm1K
|
|
|
|
matchlen_bsf_16match_nolit2_encodeBlockAsm1K:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R10, R10
|
|
|
|
#else
|
|
BSFQ R10, R10
|
|
|
|
#endif
|
|
SARQ $0x03, R10
|
|
LEAL 8(R11)(R10*1), R11
|
|
JMP match_nolit2_end_encodeBlockAsm1K
|
|
|
|
matchlen_match8_match_nolit2_encodeBlockAsm1K:
|
|
CMPL DI, $0x08
|
|
JB matchlen_match4_match_nolit2_encodeBlockAsm1K
|
|
MOVQ (R8)(R11*1), R9
|
|
XORQ (SI)(R11*1), R9
|
|
JNZ matchlen_bsf_8_match_nolit2_encodeBlockAsm1K
|
|
LEAL -8(DI), DI
|
|
LEAL 8(R11), R11
|
|
JMP matchlen_match4_match_nolit2_encodeBlockAsm1K
|
|
|
|
matchlen_bsf_8_match_nolit2_encodeBlockAsm1K:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R9, R9
|
|
|
|
#else
|
|
BSFQ R9, R9
|
|
|
|
#endif
|
|
SARQ $0x03, R9
|
|
LEAL (R11)(R9*1), R11
|
|
JMP match_nolit2_end_encodeBlockAsm1K
|
|
|
|
matchlen_match4_match_nolit2_encodeBlockAsm1K:
|
|
CMPL DI, $0x04
|
|
JB matchlen_match2_match_nolit2_encodeBlockAsm1K
|
|
MOVL (R8)(R11*1), R9
|
|
CMPL (SI)(R11*1), R9
|
|
JNE matchlen_match2_match_nolit2_encodeBlockAsm1K
|
|
LEAL -4(DI), DI
|
|
LEAL 4(R11), R11
|
|
|
|
matchlen_match2_match_nolit2_encodeBlockAsm1K:
|
|
CMPL DI, $0x01
|
|
JE matchlen_match1_match_nolit2_encodeBlockAsm1K
|
|
JB match_nolit2_end_encodeBlockAsm1K
|
|
MOVW (R8)(R11*1), R9
|
|
CMPW (SI)(R11*1), R9
|
|
JNE matchlen_match1_match_nolit2_encodeBlockAsm1K
|
|
LEAL 2(R11), R11
|
|
SUBL $0x02, DI
|
|
JZ match_nolit2_end_encodeBlockAsm1K
|
|
|
|
matchlen_match1_match_nolit2_encodeBlockAsm1K:
|
|
MOVB (R8)(R11*1), R9
|
|
CMPB (SI)(R11*1), R9
|
|
JNE match_nolit2_end_encodeBlockAsm1K
|
|
LEAL 1(R11), R11
|
|
|
|
match_nolit2_end_encodeBlockAsm1K:
|
|
ADDL R11, DX
|
|
ADDL $0x04, R11
|
|
MOVL DX, 12(SP)
|
|
MOVL 16(SP), SI
|
|
JMP match_nolits_copy_encodeBlockAsm1K
|
|
|
|
emit_remainder_encodeBlockAsm1K:
|
|
MOVQ src_len+32(FP), AX
|
|
MOVL 12(SP), DX
|
|
SUBL DX, AX
|
|
JZ emit_remainder_end_encodeBlockAsm1K
|
|
LEAQ (BX)(DX*1), DX
|
|
LEAQ 3(CX)(AX*1), BX
|
|
CMPQ BX, (SP)
|
|
JB dst_size_check_ok_5
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
dst_size_check_ok_5:
|
|
// emitLiteral
|
|
LEAL -1(AX), BX
|
|
CMPL BX, $0x1d
|
|
JB one_byte_emit_remainder_encodeBlockAsm1K
|
|
SUBL $0x1d, BX
|
|
CMPL BX, $0x00000100
|
|
JB two_bytes_emit_remainder_encodeBlockAsm1K
|
|
JB three_bytes_emit_remainder_encodeBlockAsm1K
|
|
|
|
three_bytes_emit_remainder_encodeBlockAsm1K:
|
|
MOVB $0xf0, (CX)
|
|
MOVW BX, 1(CX)
|
|
ADDQ $0x03, CX
|
|
ADDL $0x1d, BX
|
|
JMP memmove_long_emit_remainder_encodeBlockAsm1K
|
|
|
|
two_bytes_emit_remainder_encodeBlockAsm1K:
|
|
MOVB $0xe8, (CX)
|
|
MOVB BL, 1(CX)
|
|
ADDL $0x1d, BX
|
|
ADDQ $0x02, CX
|
|
CMPL BX, $0x40
|
|
JB memmove_midemit_remainder_encodeBlockAsm1K
|
|
JMP memmove_long_emit_remainder_encodeBlockAsm1K
|
|
|
|
one_byte_emit_remainder_encodeBlockAsm1K:
|
|
SHLB $0x03, BL
|
|
MOVB BL, (CX)
|
|
ADDQ $0x01, CX
|
|
LEAQ (CX)(AX*1), BX
|
|
|
|
// genMemMoveShort
|
|
// margin: 0, min move: 1
|
|
CMPQ AX, $0x03
|
|
JB emit_lit_memmove_emit_remainder_encodeBlockAsm1K_memmove_move_1or2
|
|
JE emit_lit_memmove_emit_remainder_encodeBlockAsm1K_memmove_move_3
|
|
CMPQ AX, $0x08
|
|
JBE emit_lit_memmove_emit_remainder_encodeBlockAsm1K_memmove_move_4through8
|
|
CMPQ AX, $0x10
|
|
JBE emit_lit_memmove_emit_remainder_encodeBlockAsm1K_memmove_move_8through16
|
|
CMPQ AX, $0x20
|
|
JBE emit_lit_memmove_emit_remainder_encodeBlockAsm1K_memmove_move_17through32
|
|
JMP emit_lit_memmove_emit_remainder_encodeBlockAsm1K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBlockAsm1K_memmove_move_1or2:
|
|
MOVB (DX), SI
|
|
MOVB -1(DX)(AX*1), DL
|
|
MOVB SI, (CX)
|
|
MOVB DL, -1(CX)(AX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBlockAsm1K
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBlockAsm1K_memmove_move_3:
|
|
MOVW (DX), SI
|
|
MOVB 2(DX), DL
|
|
MOVW SI, (CX)
|
|
MOVB DL, 2(CX)
|
|
JMP memmove_end_copy_emit_remainder_encodeBlockAsm1K
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBlockAsm1K_memmove_move_4through8:
|
|
MOVL (DX), SI
|
|
MOVL -4(DX)(AX*1), DX
|
|
MOVL SI, (CX)
|
|
MOVL DX, -4(CX)(AX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBlockAsm1K
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBlockAsm1K_memmove_move_8through16:
|
|
MOVQ (DX), SI
|
|
MOVQ -8(DX)(AX*1), DX
|
|
MOVQ SI, (CX)
|
|
MOVQ DX, -8(CX)(AX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBlockAsm1K
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBlockAsm1K_memmove_move_17through32:
|
|
MOVOU (DX), X0
|
|
MOVOU -16(DX)(AX*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(AX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBlockAsm1K
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBlockAsm1K_memmove_move_33through64:
|
|
MOVOU (DX), X0
|
|
MOVOU 16(DX), X1
|
|
MOVOU -32(DX)(AX*1), X2
|
|
MOVOU -16(DX)(AX*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(AX*1)
|
|
MOVOU X3, -16(CX)(AX*1)
|
|
|
|
memmove_end_copy_emit_remainder_encodeBlockAsm1K:
|
|
MOVQ BX, CX
|
|
JMP emit_remainder_end_encodeBlockAsm1K
|
|
|
|
memmove_midemit_remainder_encodeBlockAsm1K:
|
|
LEAQ (CX)(AX*1), BX
|
|
|
|
// genMemMoveShort
|
|
// margin: 0, min move: 30
|
|
CMPQ AX, $0x20
|
|
JBE emit_lit_memmove_mid_emit_remainder_encodeBlockAsm1K_memmove_move_17through32
|
|
JMP emit_lit_memmove_mid_emit_remainder_encodeBlockAsm1K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_mid_emit_remainder_encodeBlockAsm1K_memmove_move_17through32:
|
|
MOVOU (DX), X0
|
|
MOVOU -16(DX)(AX*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(AX*1)
|
|
JMP memmove_mid_end_copy_emit_remainder_encodeBlockAsm1K
|
|
|
|
emit_lit_memmove_mid_emit_remainder_encodeBlockAsm1K_memmove_move_33through64:
|
|
MOVOU (DX), X0
|
|
MOVOU 16(DX), X1
|
|
MOVOU -32(DX)(AX*1), X2
|
|
MOVOU -16(DX)(AX*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(AX*1)
|
|
MOVOU X3, -16(CX)(AX*1)
|
|
|
|
memmove_mid_end_copy_emit_remainder_encodeBlockAsm1K:
|
|
MOVQ BX, CX
|
|
JMP emit_remainder_end_encodeBlockAsm1K
|
|
|
|
memmove_long_emit_remainder_encodeBlockAsm1K:
|
|
LEAQ (CX)(AX*1), BX
|
|
|
|
// genMemMoveLong
|
|
MOVOU (DX), X0
|
|
MOVOU 16(DX), X1
|
|
MOVOU -32(DX)(AX*1), X2
|
|
MOVOU -16(DX)(AX*1), X3
|
|
MOVQ AX, DI
|
|
SHRQ $0x05, DI
|
|
MOVQ CX, SI
|
|
ANDL $0x0000001f, SI
|
|
MOVQ $0x00000040, R8
|
|
SUBQ SI, R8
|
|
DECQ DI
|
|
JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm1Klarge_forward_sse_loop_32
|
|
LEAQ -32(DX)(R8*1), SI
|
|
LEAQ -32(CX)(R8*1), R9
|
|
|
|
emit_lit_memmove_long_emit_remainder_encodeBlockAsm1Klarge_big_loop_back:
|
|
MOVOU (SI), X4
|
|
MOVOU 16(SI), X5
|
|
MOVOA X4, (R9)
|
|
MOVOA X5, 16(R9)
|
|
ADDQ $0x20, R9
|
|
ADDQ $0x20, SI
|
|
ADDQ $0x20, R8
|
|
DECQ DI
|
|
JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm1Klarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_emit_remainder_encodeBlockAsm1Klarge_forward_sse_loop_32:
|
|
MOVOU -32(DX)(R8*1), X4
|
|
MOVOU -16(DX)(R8*1), X5
|
|
MOVOA X4, -32(CX)(R8*1)
|
|
MOVOA X5, -16(CX)(R8*1)
|
|
ADDQ $0x20, R8
|
|
CMPQ AX, R8
|
|
JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm1Klarge_forward_sse_loop_32
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(AX*1)
|
|
MOVOU X3, -16(CX)(AX*1)
|
|
MOVQ BX, CX
|
|
|
|
emit_remainder_end_encodeBlockAsm1K:
|
|
MOVQ dst_base+0(FP), AX
|
|
SUBQ AX, CX
|
|
MOVQ CX, ret+56(FP)
|
|
RET
|
|
|
|
// func encodeBetterBlockAsm(dst []byte, src []byte, tmp *[589824]byte) int
|
|
// Requires: BMI, CMOV, SSE2
|
|
TEXT ·encodeBetterBlockAsm(SB), $24-64
|
|
MOVQ tmp+48(FP), AX
|
|
MOVQ dst_base+0(FP), CX
|
|
MOVQ $0x00001200, DX
|
|
PXOR X0, X0
|
|
|
|
zero_loop_encodeBetterBlockAsm:
|
|
MOVOU X0, (AX)
|
|
MOVOU X0, 16(AX)
|
|
MOVOU X0, 32(AX)
|
|
MOVOU X0, 48(AX)
|
|
MOVOU X0, 64(AX)
|
|
MOVOU X0, 80(AX)
|
|
MOVOU X0, 96(AX)
|
|
MOVOU X0, 112(AX)
|
|
ADDQ $0x80, AX
|
|
DECQ DX
|
|
JNZ zero_loop_encodeBetterBlockAsm
|
|
MOVL $0x00000000, 12(SP)
|
|
MOVQ src_len+32(FP), AX
|
|
LEAQ -17(AX), DX
|
|
LEAQ -17(AX), DI
|
|
MOVL DI, 8(SP)
|
|
SHRQ $0x05, AX
|
|
SUBL AX, DX
|
|
LEAQ (CX)(DX*1), DX
|
|
MOVQ DX, (SP)
|
|
MOVL $0x00000001, AX
|
|
MOVL AX, 16(SP)
|
|
MOVQ src_base+24(FP), DX
|
|
|
|
search_loop_encodeBetterBlockAsm:
|
|
MOVQ tmp+48(FP), DI
|
|
MOVL AX, R8
|
|
SUBL 12(SP), R8
|
|
SHRL $0x08, R8
|
|
CMPL R8, $0x63
|
|
JBE check_maxskip_ok_encodeBetterBlockAsm
|
|
LEAL 100(AX), R8
|
|
JMP check_maxskip_cont_encodeBetterBlockAsm
|
|
|
|
check_maxskip_ok_encodeBetterBlockAsm:
|
|
LEAL 1(AX)(R8*1), R8
|
|
|
|
check_maxskip_cont_encodeBetterBlockAsm:
|
|
CMPL R8, 8(SP)
|
|
JAE emit_remainder_encodeBetterBlockAsm
|
|
MOVQ (DX)(AX*1), R9
|
|
MOVL R8, 20(SP)
|
|
MOVQ $0x00cf1bbcdcbfa563, R11
|
|
MOVQ $0x9e3779b1, R8
|
|
MOVQ R9, R12
|
|
MOVQ R9, R13
|
|
SHLQ $0x08, R12
|
|
IMULQ R11, R12
|
|
SHRQ $0x2f, R12
|
|
SHLQ $0x20, R13
|
|
IMULQ R8, R13
|
|
SHRQ $0x32, R13
|
|
MOVL (DI)(R12*4), R8
|
|
MOVL 524288(DI)(R13*4), R10
|
|
MOVL AX, (DI)(R12*4)
|
|
MOVL AX, 524288(DI)(R13*4)
|
|
LEAL -2162685(AX), R12
|
|
CMPL R8, R12
|
|
JLE offset_ok_0_encodeBetterBlockAsm
|
|
MOVQ (DX)(R8*1), BX
|
|
CMPQ BX, R9
|
|
JEQ candidate_match_encodeBetterBlockAsm
|
|
|
|
offset_ok_0_encodeBetterBlockAsm:
|
|
CMPL R10, R12
|
|
JLE offset_ok_1_encodeBetterBlockAsm
|
|
MOVQ (DX)(R10*1), SI
|
|
CMPQ SI, R9
|
|
|
|
offset_ok_1_encodeBetterBlockAsm:
|
|
MOVL AX, R13
|
|
SUBL 16(SP), R13
|
|
MOVQ (DX)(R13*1), R13
|
|
MOVQ $0x000000ffffffff00, R14
|
|
XORQ R9, R13
|
|
TESTQ R14, R13
|
|
JNE no_repeat_found_encodeBetterBlockAsm
|
|
LEAL 1(AX), DI
|
|
MOVL 12(SP), R8
|
|
MOVL DI, R9
|
|
SUBL 16(SP), R9
|
|
JZ repeat_extend_back_end_encodeBetterBlockAsm
|
|
|
|
repeat_extend_back_loop_encodeBetterBlockAsm:
|
|
CMPL DI, R8
|
|
JBE repeat_extend_back_end_encodeBetterBlockAsm
|
|
MOVB -1(DX)(R9*1), R10
|
|
MOVB -1(DX)(DI*1), R11
|
|
CMPB R10, R11
|
|
JNE repeat_extend_back_end_encodeBetterBlockAsm
|
|
LEAL -1(DI), DI
|
|
DECL R9
|
|
JNZ repeat_extend_back_loop_encodeBetterBlockAsm
|
|
|
|
repeat_extend_back_end_encodeBetterBlockAsm:
|
|
MOVL DI, R8
|
|
SUBL 12(SP), R8
|
|
LEAQ 4(CX)(R8*1), R8
|
|
CMPQ R8, (SP)
|
|
JB repeat_dst_size_check_encodeBetterBlockAsm
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
repeat_dst_size_check_encodeBetterBlockAsm:
|
|
// emitLiteralsDstP
|
|
MOVL 12(SP), R8
|
|
CMPL R8, DI
|
|
JEQ emit_literal_done_repeat_emit_encodeBetterBlockAsm
|
|
MOVL DI, R9
|
|
MOVL DI, 12(SP)
|
|
LEAQ (DX)(R8*1), R10
|
|
SUBL R8, R9
|
|
|
|
// emitLiteral
|
|
LEAL -1(R9), R8
|
|
CMPL R8, $0x1d
|
|
JB one_byte_repeat_emit_encodeBetterBlockAsm
|
|
SUBL $0x1d, R8
|
|
CMPL R8, $0x00000100
|
|
JB two_bytes_repeat_emit_encodeBetterBlockAsm
|
|
CMPL R8, $0x00010000
|
|
JB three_bytes_repeat_emit_encodeBetterBlockAsm
|
|
MOVL R8, R11
|
|
SHRL $0x10, R11
|
|
MOVB $0xf8, (CX)
|
|
MOVW R8, 1(CX)
|
|
MOVB R11, 3(CX)
|
|
ADDQ $0x04, CX
|
|
ADDL $0x1d, R8
|
|
JMP memmove_long_repeat_emit_encodeBetterBlockAsm
|
|
|
|
three_bytes_repeat_emit_encodeBetterBlockAsm:
|
|
MOVB $0xf0, (CX)
|
|
MOVW R8, 1(CX)
|
|
ADDQ $0x03, CX
|
|
ADDL $0x1d, R8
|
|
JMP memmove_long_repeat_emit_encodeBetterBlockAsm
|
|
|
|
two_bytes_repeat_emit_encodeBetterBlockAsm:
|
|
MOVB $0xe8, (CX)
|
|
MOVB R8, 1(CX)
|
|
ADDL $0x1d, R8
|
|
ADDQ $0x02, CX
|
|
CMPL R8, $0x40
|
|
JB memmove_midrepeat_emit_encodeBetterBlockAsm
|
|
JMP memmove_long_repeat_emit_encodeBetterBlockAsm
|
|
|
|
one_byte_repeat_emit_encodeBetterBlockAsm:
|
|
SHLB $0x03, R8
|
|
MOVB R8, (CX)
|
|
ADDQ $0x01, CX
|
|
LEAQ (CX)(R9*1), R8
|
|
|
|
// genMemMoveShort
|
|
// margin: 16, min move: 1
|
|
CMPQ R9, $0x10
|
|
JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_8through16
|
|
CMPQ R9, $0x20
|
|
JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_17through32
|
|
JMP emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_33through64
|
|
|
|
emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_8through16:
|
|
MOVOU (R10), X0
|
|
MOVOU X0, (CX)
|
|
JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm
|
|
|
|
emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_17through32:
|
|
MOVOU (R10), X0
|
|
MOVOU -16(R10)(R9*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(R9*1)
|
|
JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm
|
|
|
|
emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_33through64:
|
|
MOVOU (R10), X0
|
|
MOVOU 16(R10), X1
|
|
MOVOU -32(R10)(R9*1), X2
|
|
MOVOU -16(R10)(R9*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(R9*1)
|
|
MOVOU X3, -16(CX)(R9*1)
|
|
|
|
memmove_end_copy_repeat_emit_encodeBetterBlockAsm:
|
|
MOVQ R8, CX
|
|
JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm
|
|
|
|
memmove_midrepeat_emit_encodeBetterBlockAsm:
|
|
LEAQ (CX)(R9*1), R8
|
|
|
|
// genMemMoveShort
|
|
// margin: 15, min move: 30
|
|
CMPQ R9, $0x20
|
|
JBE emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm_memmove_move_17through32
|
|
JMP emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm_memmove_move_33through64
|
|
|
|
emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm_memmove_move_17through32:
|
|
MOVOU (R10), X0
|
|
MOVOU -16(R10)(R9*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(R9*1)
|
|
JMP memmove_mid_end_copy_repeat_emit_encodeBetterBlockAsm
|
|
|
|
emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm_memmove_move_33through64:
|
|
MOVOU (R10), X0
|
|
MOVOU 16(R10), X1
|
|
MOVOU -32(R10)(R9*1), X2
|
|
MOVOU -16(R10)(R9*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(R9*1)
|
|
MOVOU X3, -16(CX)(R9*1)
|
|
|
|
memmove_mid_end_copy_repeat_emit_encodeBetterBlockAsm:
|
|
MOVQ R8, CX
|
|
JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm
|
|
|
|
memmove_long_repeat_emit_encodeBetterBlockAsm:
|
|
LEAQ (CX)(R9*1), R8
|
|
|
|
// genMemMoveLong
|
|
MOVOU (R10), X0
|
|
MOVOU 16(R10), X1
|
|
MOVOU -32(R10)(R9*1), X2
|
|
MOVOU -16(R10)(R9*1), X3
|
|
MOVQ R9, R12
|
|
SHRQ $0x05, R12
|
|
MOVQ CX, R11
|
|
ANDL $0x0000001f, R11
|
|
MOVQ $0x00000040, R13
|
|
SUBQ R11, R13
|
|
DECQ R12
|
|
JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32
|
|
LEAQ -32(R10)(R13*1), R11
|
|
LEAQ -32(CX)(R13*1), R14
|
|
|
|
emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_big_loop_back:
|
|
MOVOU (R11), X4
|
|
MOVOU 16(R11), X5
|
|
MOVOA X4, (R14)
|
|
MOVOA X5, 16(R14)
|
|
ADDQ $0x20, R14
|
|
ADDQ $0x20, R11
|
|
ADDQ $0x20, R13
|
|
DECQ R12
|
|
JNA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32:
|
|
MOVOU -32(R10)(R13*1), X4
|
|
MOVOU -16(R10)(R13*1), X5
|
|
MOVOA X4, -32(CX)(R13*1)
|
|
MOVOA X5, -16(CX)(R13*1)
|
|
ADDQ $0x20, R13
|
|
CMPQ R9, R13
|
|
JAE emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(R9*1)
|
|
MOVOU X3, -16(CX)(R9*1)
|
|
MOVQ R8, CX
|
|
|
|
emit_literal_done_repeat_emit_encodeBetterBlockAsm:
|
|
ADDL $0x05, AX
|
|
MOVL AX, R8
|
|
SUBL 16(SP), R8
|
|
MOVQ src_len+32(FP), R9
|
|
SUBL AX, R9
|
|
LEAQ (DX)(AX*1), R10
|
|
LEAQ (DX)(R8*1), R8
|
|
|
|
// matchLen
|
|
XORL R12, R12
|
|
JMP matchlen_loop_16_entry_repeat_extend_encodeBetterBlockAsm
|
|
|
|
matchlen_loopback_16_repeat_extend_encodeBetterBlockAsm:
|
|
MOVQ (R10)(R12*1), R11
|
|
MOVQ 8(R10)(R12*1), R13
|
|
XORQ (R8)(R12*1), R11
|
|
JNZ matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm
|
|
XORQ 8(R8)(R12*1), R13
|
|
JNZ matchlen_bsf_16repeat_extend_encodeBetterBlockAsm
|
|
LEAL -16(R9), R9
|
|
LEAL 16(R12), R12
|
|
|
|
matchlen_loop_16_entry_repeat_extend_encodeBetterBlockAsm:
|
|
CMPL R9, $0x10
|
|
JAE matchlen_loopback_16_repeat_extend_encodeBetterBlockAsm
|
|
JMP matchlen_match8_repeat_extend_encodeBetterBlockAsm
|
|
|
|
matchlen_bsf_16repeat_extend_encodeBetterBlockAsm:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R13, R13
|
|
|
|
#else
|
|
BSFQ R13, R13
|
|
|
|
#endif
|
|
SARQ $0x03, R13
|
|
LEAL 8(R12)(R13*1), R12
|
|
JMP repeat_extend_forward_end_encodeBetterBlockAsm
|
|
|
|
matchlen_match8_repeat_extend_encodeBetterBlockAsm:
|
|
CMPL R9, $0x08
|
|
JB matchlen_match4_repeat_extend_encodeBetterBlockAsm
|
|
MOVQ (R10)(R12*1), R11
|
|
XORQ (R8)(R12*1), R11
|
|
JNZ matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm
|
|
LEAL -8(R9), R9
|
|
LEAL 8(R12), R12
|
|
JMP matchlen_match4_repeat_extend_encodeBetterBlockAsm
|
|
|
|
matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R11, R11
|
|
|
|
#else
|
|
BSFQ R11, R11
|
|
|
|
#endif
|
|
SARQ $0x03, R11
|
|
LEAL (R12)(R11*1), R12
|
|
JMP repeat_extend_forward_end_encodeBetterBlockAsm
|
|
|
|
matchlen_match4_repeat_extend_encodeBetterBlockAsm:
|
|
CMPL R9, $0x04
|
|
JB matchlen_match2_repeat_extend_encodeBetterBlockAsm
|
|
MOVL (R10)(R12*1), R11
|
|
CMPL (R8)(R12*1), R11
|
|
JNE matchlen_match2_repeat_extend_encodeBetterBlockAsm
|
|
LEAL -4(R9), R9
|
|
LEAL 4(R12), R12
|
|
|
|
matchlen_match2_repeat_extend_encodeBetterBlockAsm:
|
|
CMPL R9, $0x01
|
|
JE matchlen_match1_repeat_extend_encodeBetterBlockAsm
|
|
JB repeat_extend_forward_end_encodeBetterBlockAsm
|
|
MOVW (R10)(R12*1), R11
|
|
CMPW (R8)(R12*1), R11
|
|
JNE matchlen_match1_repeat_extend_encodeBetterBlockAsm
|
|
LEAL 2(R12), R12
|
|
SUBL $0x02, R9
|
|
JZ repeat_extend_forward_end_encodeBetterBlockAsm
|
|
|
|
matchlen_match1_repeat_extend_encodeBetterBlockAsm:
|
|
MOVB (R10)(R12*1), R11
|
|
CMPB (R8)(R12*1), R11
|
|
JNE repeat_extend_forward_end_encodeBetterBlockAsm
|
|
LEAL 1(R12), R12
|
|
|
|
repeat_extend_forward_end_encodeBetterBlockAsm:
|
|
ADDL R12, AX
|
|
MOVL AX, R8
|
|
SUBL DI, R8
|
|
MOVL 16(SP), DI
|
|
|
|
// emitRepeat
|
|
LEAL -1(R8), DI
|
|
CMPL R8, $0x1d
|
|
JBE repeat_one_match_repeat_encodeBetterBlockAsm
|
|
LEAL -30(R8), DI
|
|
CMPL R8, $0x0000011e
|
|
JB repeat_two_match_repeat_encodeBetterBlockAsm
|
|
CMPL R8, $0x0001001e
|
|
JB repeat_three_match_repeat_encodeBetterBlockAsm
|
|
MOVB $0xfc, (CX)
|
|
MOVL DI, 1(CX)
|
|
ADDQ $0x04, CX
|
|
JMP repeat_end_emit_encodeBetterBlockAsm
|
|
|
|
repeat_three_match_repeat_encodeBetterBlockAsm:
|
|
MOVB $0xf4, (CX)
|
|
MOVW DI, 1(CX)
|
|
ADDQ $0x03, CX
|
|
JMP repeat_end_emit_encodeBetterBlockAsm
|
|
|
|
repeat_two_match_repeat_encodeBetterBlockAsm:
|
|
MOVB $0xec, (CX)
|
|
MOVB DI, 1(CX)
|
|
ADDQ $0x02, CX
|
|
JMP repeat_end_emit_encodeBetterBlockAsm
|
|
|
|
repeat_one_match_repeat_encodeBetterBlockAsm:
|
|
XORL DI, DI
|
|
LEAL -4(DI)(R8*8), DI
|
|
MOVB DI, (CX)
|
|
ADDQ $0x01, CX
|
|
|
|
repeat_end_emit_encodeBetterBlockAsm:
|
|
MOVL AX, 12(SP)
|
|
JMP search_loop_encodeBetterBlockAsm
|
|
|
|
no_repeat_found_encodeBetterBlockAsm:
|
|
CMPL R8, R12
|
|
JLE offset_ok_2_encodeBetterBlockAsm
|
|
CMPL BX, R9
|
|
JEQ candidate_match_encodeBetterBlockAsm
|
|
|
|
offset_ok_2_encodeBetterBlockAsm:
|
|
CMPL R10, R12
|
|
JLE offset_ok_3_encodeBetterBlockAsm
|
|
CMPL SI, R9
|
|
JEQ candidateS_match_encodeBetterBlockAsm
|
|
|
|
offset_ok_3_encodeBetterBlockAsm:
|
|
MOVL 20(SP), AX
|
|
JMP search_loop_encodeBetterBlockAsm
|
|
|
|
candidateS_match_encodeBetterBlockAsm:
|
|
SHRQ $0x08, R9
|
|
MOVQ R9, R13
|
|
SHLQ $0x08, R13
|
|
IMULQ R11, R13
|
|
SHRQ $0x2f, R13
|
|
MOVL (DI)(R13*4), R8
|
|
INCL AX
|
|
MOVL AX, (DI)(R13*4)
|
|
CMPL R8, R12
|
|
JLE offset_ok_4_encodeBetterBlockAsm
|
|
CMPL (DX)(R8*1), R9
|
|
JEQ candidate_match_encodeBetterBlockAsm
|
|
|
|
offset_ok_4_encodeBetterBlockAsm:
|
|
DECL AX
|
|
MOVL R10, R8
|
|
|
|
candidate_match_encodeBetterBlockAsm:
|
|
MOVL 12(SP), DI
|
|
TESTL R8, R8
|
|
JZ match_extend_back_end_encodeBetterBlockAsm
|
|
|
|
match_extend_back_loop_encodeBetterBlockAsm:
|
|
CMPL AX, DI
|
|
JBE match_extend_back_end_encodeBetterBlockAsm
|
|
MOVB -1(DX)(R8*1), R9
|
|
MOVB -1(DX)(AX*1), R10
|
|
CMPB R9, R10
|
|
JNE match_extend_back_end_encodeBetterBlockAsm
|
|
LEAL -1(AX), AX
|
|
DECL R8
|
|
JZ match_extend_back_end_encodeBetterBlockAsm
|
|
JMP match_extend_back_loop_encodeBetterBlockAsm
|
|
|
|
match_extend_back_end_encodeBetterBlockAsm:
|
|
MOVL AX, DI
|
|
SUBL 12(SP), DI
|
|
LEAQ 4(CX)(DI*1), DI
|
|
CMPQ DI, (SP)
|
|
JB match_dst_size_check_encodeBetterBlockAsm
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
match_dst_size_check_encodeBetterBlockAsm:
|
|
MOVL AX, DI
|
|
ADDL $0x04, AX
|
|
ADDL $0x04, R8
|
|
MOVQ src_len+32(FP), R9
|
|
SUBL AX, R9
|
|
LEAQ (DX)(AX*1), R10
|
|
LEAQ (DX)(R8*1), R11
|
|
|
|
// matchLen
|
|
XORL R13, R13
|
|
JMP matchlen_loop_16_entry_match_nolit_encodeBetterBlockAsm
|
|
|
|
matchlen_loopback_16_match_nolit_encodeBetterBlockAsm:
|
|
MOVQ (R10)(R13*1), R12
|
|
MOVQ 8(R10)(R13*1), R14
|
|
XORQ (R11)(R13*1), R12
|
|
JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm
|
|
XORQ 8(R11)(R13*1), R14
|
|
JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm
|
|
LEAL -16(R9), R9
|
|
LEAL 16(R13), R13
|
|
|
|
matchlen_loop_16_entry_match_nolit_encodeBetterBlockAsm:
|
|
CMPL R9, $0x10
|
|
JAE matchlen_loopback_16_match_nolit_encodeBetterBlockAsm
|
|
JMP matchlen_match8_match_nolit_encodeBetterBlockAsm
|
|
|
|
matchlen_bsf_16match_nolit_encodeBetterBlockAsm:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R14, R14
|
|
|
|
#else
|
|
BSFQ R14, R14
|
|
|
|
#endif
|
|
SARQ $0x03, R14
|
|
LEAL 8(R13)(R14*1), R13
|
|
JMP match_nolit_end_encodeBetterBlockAsm
|
|
|
|
matchlen_match8_match_nolit_encodeBetterBlockAsm:
|
|
CMPL R9, $0x08
|
|
JB matchlen_match4_match_nolit_encodeBetterBlockAsm
|
|
MOVQ (R10)(R13*1), R12
|
|
XORQ (R11)(R13*1), R12
|
|
JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm
|
|
LEAL -8(R9), R9
|
|
LEAL 8(R13), R13
|
|
JMP matchlen_match4_match_nolit_encodeBetterBlockAsm
|
|
|
|
matchlen_bsf_8_match_nolit_encodeBetterBlockAsm:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R12, R12
|
|
|
|
#else
|
|
BSFQ R12, R12
|
|
|
|
#endif
|
|
SARQ $0x03, R12
|
|
LEAL (R13)(R12*1), R13
|
|
JMP match_nolit_end_encodeBetterBlockAsm
|
|
|
|
matchlen_match4_match_nolit_encodeBetterBlockAsm:
|
|
CMPL R9, $0x04
|
|
JB matchlen_match2_match_nolit_encodeBetterBlockAsm
|
|
MOVL (R10)(R13*1), R12
|
|
CMPL (R11)(R13*1), R12
|
|
JNE matchlen_match2_match_nolit_encodeBetterBlockAsm
|
|
LEAL -4(R9), R9
|
|
LEAL 4(R13), R13
|
|
|
|
matchlen_match2_match_nolit_encodeBetterBlockAsm:
|
|
CMPL R9, $0x01
|
|
JE matchlen_match1_match_nolit_encodeBetterBlockAsm
|
|
JB match_nolit_end_encodeBetterBlockAsm
|
|
MOVW (R10)(R13*1), R12
|
|
CMPW (R11)(R13*1), R12
|
|
JNE matchlen_match1_match_nolit_encodeBetterBlockAsm
|
|
LEAL 2(R13), R13
|
|
SUBL $0x02, R9
|
|
JZ match_nolit_end_encodeBetterBlockAsm
|
|
|
|
matchlen_match1_match_nolit_encodeBetterBlockAsm:
|
|
MOVB (R10)(R13*1), R12
|
|
CMPB (R11)(R13*1), R12
|
|
JNE match_nolit_end_encodeBetterBlockAsm
|
|
LEAL 1(R13), R13
|
|
|
|
match_nolit_end_encodeBetterBlockAsm:
|
|
MOVL AX, R9
|
|
SUBL R8, R9
|
|
CMPL R13, $0x01
|
|
JA match_length_ok_encodeBetterBlockAsm
|
|
CMPL R9, $0x0001003f
|
|
JBE match_length_ok_encodeBetterBlockAsm
|
|
MOVL 20(SP), AX
|
|
INCL AX
|
|
JMP search_loop_encodeBetterBlockAsm
|
|
|
|
match_length_ok_encodeBetterBlockAsm:
|
|
MOVL R9, 16(SP)
|
|
|
|
// Check if we can combine lit+copy
|
|
MOVLQZX 12(SP), R10
|
|
MOVL DI, R8
|
|
SUBL R10, R8
|
|
JZ match_emit_nolits_encodeBetterBlockAsm
|
|
CMPL R9, $0x00000040
|
|
JL match_emit_lits_encodeBetterBlockAsm
|
|
CMPL R9, $0x0001003f
|
|
JA match_emit_copy3_encodeBetterBlockAsm
|
|
CMPL R8, $0x04
|
|
JA match_emit_lits_encodeBetterBlockAsm
|
|
MOVL (DX)(R10*1), R10
|
|
ADDL R13, AX
|
|
ADDL $0x04, R13
|
|
MOVL AX, 12(SP)
|
|
|
|
// emitCopy2WithLits
|
|
XORQ R11, R11
|
|
SUBL $0x40, R9
|
|
LEAL -11(R13), R12
|
|
LEAL -4(R13), R13
|
|
MOVW R9, 1(CX)
|
|
CMPL R13, $0x07
|
|
CMOVLGE R12, R11
|
|
MOVQ $0x00000007, R9
|
|
CMOVLLT R13, R9
|
|
LEAL -1(R8)(R9*4), R9
|
|
MOVL $0x00000003, R12
|
|
LEAL (R12)(R9*8), R9
|
|
MOVB R9, (CX)
|
|
ADDQ $0x03, CX
|
|
MOVL R10, (CX)
|
|
ADDQ R8, CX
|
|
TESTL R11, R11
|
|
JZ match_nolit_emitcopy_end_encodeBetterBlockAsm
|
|
|
|
// emitRepeat
|
|
LEAL -1(R11), R8
|
|
CMPL R11, $0x1d
|
|
JBE repeat_one_match_emit_repeat_copy2_encodeBetterBlockAsm
|
|
LEAL -30(R11), R8
|
|
CMPL R11, $0x0000011e
|
|
JB repeat_two_match_emit_repeat_copy2_encodeBetterBlockAsm
|
|
CMPL R11, $0x0001001e
|
|
JB repeat_three_match_emit_repeat_copy2_encodeBetterBlockAsm
|
|
MOVB $0xfc, (CX)
|
|
MOVL R8, 1(CX)
|
|
ADDQ $0x04, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
|
|
|
|
repeat_three_match_emit_repeat_copy2_encodeBetterBlockAsm:
|
|
MOVB $0xf4, (CX)
|
|
MOVW R8, 1(CX)
|
|
ADDQ $0x03, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
|
|
|
|
repeat_two_match_emit_repeat_copy2_encodeBetterBlockAsm:
|
|
MOVB $0xec, (CX)
|
|
MOVB R8, 1(CX)
|
|
ADDQ $0x02, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
|
|
|
|
repeat_one_match_emit_repeat_copy2_encodeBetterBlockAsm:
|
|
XORL R8, R8
|
|
LEAL -4(R8)(R11*8), R8
|
|
MOVB R8, (CX)
|
|
ADDQ $0x01, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
|
|
|
|
match_emit_copy3_encodeBetterBlockAsm:
|
|
CMPL R8, $0x03
|
|
JA match_emit_lits_encodeBetterBlockAsm
|
|
MOVLQZX 12(SP), R10
|
|
MOVL (DX)(R10*1), R10
|
|
ADDL R13, AX
|
|
ADDL $0x04, R13
|
|
MOVL AX, 12(SP)
|
|
|
|
// emitCopy3
|
|
LEAL -4(R13), R13
|
|
LEAL -65536(R9), R9
|
|
SHLL $0x0b, R9
|
|
LEAL 7(R9)(R8*8), R9
|
|
CMPL R13, $0x3c
|
|
JBE emit_copy3_0_match_emit_lits_encodeBetterBlockAsm
|
|
LEAL -60(R13), R11
|
|
CMPL R13, $0x0000013c
|
|
JB emit_copy3_1_match_emit_lits_encodeBetterBlockAsm
|
|
CMPL R13, $0x0001003c
|
|
JB emit_copy3_2_match_emit_lits_encodeBetterBlockAsm
|
|
ADDL $0x000007e0, R9
|
|
MOVL R9, (CX)
|
|
MOVL R11, 4(CX)
|
|
ADDQ $0x07, CX
|
|
JMP match_emit_copy_litsencodeBetterBlockAsm
|
|
|
|
emit_copy3_2_match_emit_lits_encodeBetterBlockAsm:
|
|
ADDL $0x000007c0, R9
|
|
MOVL R9, (CX)
|
|
MOVW R11, 4(CX)
|
|
ADDQ $0x06, CX
|
|
JMP match_emit_copy_litsencodeBetterBlockAsm
|
|
|
|
emit_copy3_1_match_emit_lits_encodeBetterBlockAsm:
|
|
ADDL $0x000007a0, R9
|
|
MOVL R9, (CX)
|
|
MOVB R11, 4(CX)
|
|
ADDQ $0x05, CX
|
|
JMP match_emit_copy_litsencodeBetterBlockAsm
|
|
|
|
emit_copy3_0_match_emit_lits_encodeBetterBlockAsm:
|
|
SHLL $0x05, R13
|
|
ORL R13, R9
|
|
MOVL R9, (CX)
|
|
ADDQ $0x04, CX
|
|
|
|
match_emit_copy_litsencodeBetterBlockAsm:
|
|
MOVL R10, (CX)
|
|
ADDQ R8, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
|
|
|
|
match_emit_lits_encodeBetterBlockAsm:
|
|
LEAQ (DX)(R10*1), R10
|
|
|
|
// emitLiteral
|
|
LEAL -1(R8), R11
|
|
CMPL R11, $0x1d
|
|
JB one_byte_match_emit_encodeBetterBlockAsm
|
|
SUBL $0x1d, R11
|
|
CMPL R11, $0x00000100
|
|
JB two_bytes_match_emit_encodeBetterBlockAsm
|
|
CMPL R11, $0x00010000
|
|
JB three_bytes_match_emit_encodeBetterBlockAsm
|
|
MOVL R11, R12
|
|
SHRL $0x10, R12
|
|
MOVB $0xf8, (CX)
|
|
MOVW R11, 1(CX)
|
|
MOVB R12, 3(CX)
|
|
ADDQ $0x04, CX
|
|
ADDL $0x1d, R11
|
|
JMP memmove_long_match_emit_encodeBetterBlockAsm
|
|
|
|
three_bytes_match_emit_encodeBetterBlockAsm:
|
|
MOVB $0xf0, (CX)
|
|
MOVW R11, 1(CX)
|
|
ADDQ $0x03, CX
|
|
ADDL $0x1d, R11
|
|
JMP memmove_long_match_emit_encodeBetterBlockAsm
|
|
|
|
two_bytes_match_emit_encodeBetterBlockAsm:
|
|
MOVB $0xe8, (CX)
|
|
MOVB R11, 1(CX)
|
|
ADDL $0x1d, R11
|
|
ADDQ $0x02, CX
|
|
CMPL R11, $0x40
|
|
JB memmove_midmatch_emit_encodeBetterBlockAsm
|
|
JMP memmove_long_match_emit_encodeBetterBlockAsm
|
|
|
|
one_byte_match_emit_encodeBetterBlockAsm:
|
|
SHLB $0x03, R11
|
|
MOVB R11, (CX)
|
|
ADDQ $0x01, CX
|
|
LEAQ (CX)(R8*1), R11
|
|
|
|
// genMemMoveShort
|
|
// margin: 16, min move: 1
|
|
CMPQ R8, $0x10
|
|
JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16
|
|
CMPQ R8, $0x20
|
|
JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32
|
|
JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64
|
|
|
|
emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16:
|
|
MOVOU (R10), X0
|
|
MOVOU X0, (CX)
|
|
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm
|
|
|
|
emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32:
|
|
MOVOU (R10), X0
|
|
MOVOU -16(R10)(R8*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(R8*1)
|
|
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm
|
|
|
|
emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64:
|
|
MOVOU (R10), X0
|
|
MOVOU 16(R10), X1
|
|
MOVOU -32(R10)(R8*1), X2
|
|
MOVOU -16(R10)(R8*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(R8*1)
|
|
MOVOU X3, -16(CX)(R8*1)
|
|
|
|
memmove_end_copy_match_emit_encodeBetterBlockAsm:
|
|
MOVQ R11, CX
|
|
JMP match_emit_nolits_encodeBetterBlockAsm
|
|
|
|
memmove_midmatch_emit_encodeBetterBlockAsm:
|
|
LEAQ (CX)(R8*1), R11
|
|
|
|
// genMemMoveShort
|
|
// margin: 15, min move: 30
|
|
CMPQ R8, $0x20
|
|
JBE emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm_memmove_move_17through32
|
|
JMP emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm_memmove_move_33through64
|
|
|
|
emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm_memmove_move_17through32:
|
|
MOVOU (R10), X0
|
|
MOVOU -16(R10)(R8*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(R8*1)
|
|
JMP memmove_mid_end_copy_match_emit_encodeBetterBlockAsm
|
|
|
|
emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm_memmove_move_33through64:
|
|
MOVOU (R10), X0
|
|
MOVOU 16(R10), X1
|
|
MOVOU -32(R10)(R8*1), X2
|
|
MOVOU -16(R10)(R8*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(R8*1)
|
|
MOVOU X3, -16(CX)(R8*1)
|
|
|
|
memmove_mid_end_copy_match_emit_encodeBetterBlockAsm:
|
|
MOVQ R11, CX
|
|
JMP match_emit_nolits_encodeBetterBlockAsm
|
|
|
|
memmove_long_match_emit_encodeBetterBlockAsm:
|
|
LEAQ (CX)(R8*1), R11
|
|
|
|
// genMemMoveLong
|
|
MOVOU (R10), X0
|
|
MOVOU 16(R10), X1
|
|
MOVOU -32(R10)(R8*1), X2
|
|
MOVOU -16(R10)(R8*1), X3
|
|
MOVQ R8, R14
|
|
SHRQ $0x05, R14
|
|
MOVQ CX, R12
|
|
ANDL $0x0000001f, R12
|
|
MOVQ $0x00000040, R15
|
|
SUBQ R12, R15
|
|
DECQ R14
|
|
JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32
|
|
LEAQ -32(R10)(R15*1), R12
|
|
LEAQ -32(CX)(R15*1), BP
|
|
|
|
emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back:
|
|
MOVOU (R12), X4
|
|
MOVOU 16(R12), X5
|
|
MOVOA X4, (BP)
|
|
MOVOA X5, 16(BP)
|
|
ADDQ $0x20, BP
|
|
ADDQ $0x20, R12
|
|
ADDQ $0x20, R15
|
|
DECQ R14
|
|
JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32:
|
|
MOVOU -32(R10)(R15*1), X4
|
|
MOVOU -16(R10)(R15*1), X5
|
|
MOVOA X4, -32(CX)(R15*1)
|
|
MOVOA X5, -16(CX)(R15*1)
|
|
ADDQ $0x20, R15
|
|
CMPQ R8, R15
|
|
JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(R8*1)
|
|
MOVOU X3, -16(CX)(R8*1)
|
|
MOVQ R11, CX
|
|
|
|
match_emit_nolits_encodeBetterBlockAsm:
|
|
ADDL R13, AX
|
|
ADDL $0x04, R13
|
|
MOVL AX, 12(SP)
|
|
|
|
// emitCopy
|
|
CMPL R9, $0x0001003f
|
|
JBE two_byte_offset_match_nolit_encodeBetterBlockAsm
|
|
|
|
// emitCopy3
|
|
LEAL -4(R13), R13
|
|
LEAL -65536(R9), R8
|
|
SHLL $0x0b, R8
|
|
ADDL $0x07, R8
|
|
CMPL R13, $0x3c
|
|
JBE emit_copy3_0_match_nolit_encodeBetterBlockAsm_emit3
|
|
LEAL -60(R13), R9
|
|
CMPL R13, $0x0000013c
|
|
JB emit_copy3_1_match_nolit_encodeBetterBlockAsm_emit3
|
|
CMPL R13, $0x0001003c
|
|
JB emit_copy3_2_match_nolit_encodeBetterBlockAsm_emit3
|
|
ADDL $0x000007e0, R8
|
|
MOVL R8, (CX)
|
|
MOVL R9, 4(CX)
|
|
ADDQ $0x07, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
|
|
|
|
emit_copy3_2_match_nolit_encodeBetterBlockAsm_emit3:
|
|
ADDL $0x000007c0, R8
|
|
MOVL R8, (CX)
|
|
MOVW R9, 4(CX)
|
|
ADDQ $0x06, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
|
|
|
|
emit_copy3_1_match_nolit_encodeBetterBlockAsm_emit3:
|
|
ADDL $0x000007a0, R8
|
|
MOVL R8, (CX)
|
|
MOVB R9, 4(CX)
|
|
ADDQ $0x05, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
|
|
|
|
emit_copy3_0_match_nolit_encodeBetterBlockAsm_emit3:
|
|
SHLL $0x05, R13
|
|
ORL R13, R8
|
|
MOVL R8, (CX)
|
|
ADDQ $0x04, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
|
|
|
|
two_byte_offset_match_nolit_encodeBetterBlockAsm:
|
|
CMPL R9, $0x00000400
|
|
JA two_byte_match_nolit_encodeBetterBlockAsm
|
|
CMPL R13, $0x00000013
|
|
JAE emit_one_longer_match_nolit_encodeBetterBlockAsm
|
|
LEAL -1(R9), R8
|
|
SHLL $0x06, R8
|
|
LEAL -15(R8)(R13*4), R8
|
|
MOVW R8, (CX)
|
|
ADDQ $0x02, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
|
|
|
|
emit_one_longer_match_nolit_encodeBetterBlockAsm:
|
|
CMPL R13, $0x00000112
|
|
JAE emit_copy1_repeat_match_nolit_encodeBetterBlockAsm
|
|
LEAL -1(R9), R8
|
|
SHLL $0x06, R8
|
|
LEAL 61(R8), R8
|
|
MOVW R8, (CX)
|
|
LEAL -18(R13), R8
|
|
MOVB R8, 2(CX)
|
|
ADDQ $0x03, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
|
|
|
|
emit_copy1_repeat_match_nolit_encodeBetterBlockAsm:
|
|
LEAL -1(R9), R8
|
|
SHLL $0x06, R8
|
|
LEAL 57(R8), R8
|
|
MOVW R8, (CX)
|
|
ADDQ $0x02, CX
|
|
SUBL $0x12, R13
|
|
|
|
// emitRepeat
|
|
LEAL -1(R13), R8
|
|
CMPL R13, $0x1d
|
|
JBE repeat_one_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm
|
|
LEAL -30(R13), R8
|
|
CMPL R13, $0x0000011e
|
|
JB repeat_two_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm
|
|
CMPL R13, $0x0001001e
|
|
JB repeat_three_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm
|
|
MOVB $0xfc, (CX)
|
|
MOVL R8, 1(CX)
|
|
ADDQ $0x04, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
|
|
|
|
repeat_three_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm:
|
|
MOVB $0xf4, (CX)
|
|
MOVW R8, 1(CX)
|
|
ADDQ $0x03, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
|
|
|
|
repeat_two_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm:
|
|
MOVB $0xec, (CX)
|
|
MOVB R8, 1(CX)
|
|
ADDQ $0x02, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
|
|
|
|
repeat_one_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm:
|
|
XORL R8, R8
|
|
LEAL -4(R8)(R13*8), R8
|
|
MOVB R8, (CX)
|
|
ADDQ $0x01, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
|
|
|
|
two_byte_match_nolit_encodeBetterBlockAsm:
|
|
// emitCopy2
|
|
LEAL -64(R9), R9
|
|
LEAL -4(R13), R13
|
|
MOVW R9, 1(CX)
|
|
CMPL R13, $0x3c
|
|
JBE emit_copy2_0_match_nolit_encodeBetterBlockAsm_emit2
|
|
LEAL -60(R13), R8
|
|
CMPL R13, $0x0000013c
|
|
JB emit_copy2_1_match_nolit_encodeBetterBlockAsm_emit2
|
|
CMPL R13, $0x0001003c
|
|
JB emit_copy2_2_match_nolit_encodeBetterBlockAsm_emit2
|
|
MOVB $0xfe, (CX)
|
|
MOVL R8, 3(CX)
|
|
ADDQ $0x06, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
|
|
|
|
emit_copy2_2_match_nolit_encodeBetterBlockAsm_emit2:
|
|
MOVB $0xfa, (CX)
|
|
MOVW R8, 3(CX)
|
|
ADDQ $0x05, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
|
|
|
|
emit_copy2_1_match_nolit_encodeBetterBlockAsm_emit2:
|
|
MOVB $0xf6, (CX)
|
|
MOVB R8, 3(CX)
|
|
ADDQ $0x04, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
|
|
|
|
emit_copy2_0_match_nolit_encodeBetterBlockAsm_emit2:
|
|
MOVL $0x00000002, R8
|
|
LEAL (R8)(R13*4), R8
|
|
MOVB R8, (CX)
|
|
ADDQ $0x03, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
|
|
|
|
// emitLiteralsDstP
|
|
MOVL 12(SP), R8
|
|
CMPL R8, DI
|
|
JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm
|
|
MOVL DI, R9
|
|
MOVL DI, 12(SP)
|
|
LEAQ (DX)(R8*1), R10
|
|
SUBL R8, R9
|
|
|
|
// emitLiteral
|
|
LEAL -1(R9), R8
|
|
CMPL R8, $0x1d
|
|
JB one_byte_match_emit_repeat_encodeBetterBlockAsm
|
|
SUBL $0x1d, R8
|
|
CMPL R8, $0x00000100
|
|
JB two_bytes_match_emit_repeat_encodeBetterBlockAsm
|
|
CMPL R8, $0x00010000
|
|
JB three_bytes_match_emit_repeat_encodeBetterBlockAsm
|
|
MOVL R8, R11
|
|
SHRL $0x10, R11
|
|
MOVB $0xf8, (CX)
|
|
MOVW R8, 1(CX)
|
|
MOVB R11, 3(CX)
|
|
ADDQ $0x04, CX
|
|
ADDL $0x1d, R8
|
|
JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm
|
|
|
|
three_bytes_match_emit_repeat_encodeBetterBlockAsm:
|
|
MOVB $0xf0, (CX)
|
|
MOVW R8, 1(CX)
|
|
ADDQ $0x03, CX
|
|
ADDL $0x1d, R8
|
|
JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm
|
|
|
|
two_bytes_match_emit_repeat_encodeBetterBlockAsm:
|
|
MOVB $0xe8, (CX)
|
|
MOVB R8, 1(CX)
|
|
ADDL $0x1d, R8
|
|
ADDQ $0x02, CX
|
|
CMPL R8, $0x40
|
|
JB memmove_midmatch_emit_repeat_encodeBetterBlockAsm
|
|
JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm
|
|
|
|
one_byte_match_emit_repeat_encodeBetterBlockAsm:
|
|
SHLB $0x03, R8
|
|
MOVB R8, (CX)
|
|
ADDQ $0x01, CX
|
|
LEAQ (CX)(R9*1), R8
|
|
|
|
// genMemMoveShort
|
|
// margin: 16, min move: 1
|
|
CMPQ R9, $0x10
|
|
JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16
|
|
CMPQ R9, $0x20
|
|
JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32
|
|
JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64
|
|
|
|
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16:
|
|
MOVOU (R10), X0
|
|
MOVOU X0, (CX)
|
|
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm
|
|
|
|
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32:
|
|
MOVOU (R10), X0
|
|
MOVOU -16(R10)(R9*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(R9*1)
|
|
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm
|
|
|
|
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64:
|
|
MOVOU (R10), X0
|
|
MOVOU 16(R10), X1
|
|
MOVOU -32(R10)(R9*1), X2
|
|
MOVOU -16(R10)(R9*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(R9*1)
|
|
MOVOU X3, -16(CX)(R9*1)
|
|
|
|
memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm:
|
|
MOVQ R8, CX
|
|
JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm
|
|
|
|
memmove_midmatch_emit_repeat_encodeBetterBlockAsm:
|
|
LEAQ (CX)(R9*1), R8
|
|
|
|
// genMemMoveShort
|
|
// margin: 15, min move: 30
|
|
CMPQ R9, $0x20
|
|
JBE emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32
|
|
JMP emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64
|
|
|
|
emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32:
|
|
MOVOU (R10), X0
|
|
MOVOU -16(R10)(R9*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(R9*1)
|
|
JMP memmove_mid_end_copy_match_emit_repeat_encodeBetterBlockAsm
|
|
|
|
emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64:
|
|
MOVOU (R10), X0
|
|
MOVOU 16(R10), X1
|
|
MOVOU -32(R10)(R9*1), X2
|
|
MOVOU -16(R10)(R9*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(R9*1)
|
|
MOVOU X3, -16(CX)(R9*1)
|
|
|
|
memmove_mid_end_copy_match_emit_repeat_encodeBetterBlockAsm:
|
|
MOVQ R8, CX
|
|
JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm
|
|
|
|
memmove_long_match_emit_repeat_encodeBetterBlockAsm:
|
|
LEAQ (CX)(R9*1), R8
|
|
|
|
// genMemMoveLong
|
|
MOVOU (R10), X0
|
|
MOVOU 16(R10), X1
|
|
MOVOU -32(R10)(R9*1), X2
|
|
MOVOU -16(R10)(R9*1), X3
|
|
MOVQ R9, R12
|
|
SHRQ $0x05, R12
|
|
MOVQ CX, R11
|
|
ANDL $0x0000001f, R11
|
|
MOVQ $0x00000040, R14
|
|
SUBQ R11, R14
|
|
DECQ R12
|
|
JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32
|
|
LEAQ -32(R10)(R14*1), R11
|
|
LEAQ -32(CX)(R14*1), R15
|
|
|
|
emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back:
|
|
MOVOU (R11), X4
|
|
MOVOU 16(R11), X5
|
|
MOVOA X4, (R15)
|
|
MOVOA X5, 16(R15)
|
|
ADDQ $0x20, R15
|
|
ADDQ $0x20, R11
|
|
ADDQ $0x20, R14
|
|
DECQ R12
|
|
JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32:
|
|
MOVOU -32(R10)(R14*1), X4
|
|
MOVOU -16(R10)(R14*1), X5
|
|
MOVOA X4, -32(CX)(R14*1)
|
|
MOVOA X5, -16(CX)(R14*1)
|
|
ADDQ $0x20, R14
|
|
CMPQ R9, R14
|
|
JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(R9*1)
|
|
MOVOU X3, -16(CX)(R9*1)
|
|
MOVQ R8, CX
|
|
|
|
emit_literal_done_match_emit_repeat_encodeBetterBlockAsm:
|
|
ADDL R13, AX
|
|
ADDL $0x04, R13
|
|
MOVL AX, 12(SP)
|
|
|
|
// emitRepeat
|
|
LEAL -1(R13), R8
|
|
CMPL R13, $0x1d
|
|
JBE repeat_one_match_nolit_repeat_encodeBetterBlockAsm
|
|
LEAL -30(R13), R8
|
|
CMPL R13, $0x0000011e
|
|
JB repeat_two_match_nolit_repeat_encodeBetterBlockAsm
|
|
CMPL R13, $0x0001001e
|
|
JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm
|
|
MOVB $0xfc, (CX)
|
|
MOVL R8, 1(CX)
|
|
ADDQ $0x04, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
|
|
|
|
repeat_three_match_nolit_repeat_encodeBetterBlockAsm:
|
|
MOVB $0xf4, (CX)
|
|
MOVW R8, 1(CX)
|
|
ADDQ $0x03, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
|
|
|
|
repeat_two_match_nolit_repeat_encodeBetterBlockAsm:
|
|
MOVB $0xec, (CX)
|
|
MOVB R8, 1(CX)
|
|
ADDQ $0x02, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
|
|
|
|
repeat_one_match_nolit_repeat_encodeBetterBlockAsm:
|
|
XORL R8, R8
|
|
LEAL -4(R8)(R13*8), R8
|
|
MOVB R8, (CX)
|
|
ADDQ $0x01, CX
|
|
|
|
match_nolit_emitcopy_end_encodeBetterBlockAsm:
|
|
CMPL AX, 8(SP)
|
|
JAE emit_remainder_encodeBetterBlockAsm
|
|
CMPQ CX, (SP)
|
|
JB match_nolit_dst_ok_encodeBetterBlockAsm
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
match_nolit_dst_ok_encodeBetterBlockAsm:
|
|
MOVQ tmp+48(FP), R8
|
|
MOVQ $0x00cf1bbcdcbfa563, R9
|
|
MOVQ $0x9e3779b1, R10
|
|
LEAQ 1(DI), DI
|
|
LEAQ -2(AX), R11
|
|
MOVQ (DX)(DI*1), R12
|
|
MOVQ 1(DX)(DI*1), R13
|
|
MOVQ (DX)(R11*1), R14
|
|
MOVQ 1(DX)(R11*1), R15
|
|
SHLQ $0x08, R12
|
|
IMULQ R9, R12
|
|
SHRQ $0x2f, R12
|
|
SHLQ $0x20, R13
|
|
IMULQ R10, R13
|
|
SHRQ $0x32, R13
|
|
SHLQ $0x08, R14
|
|
IMULQ R9, R14
|
|
SHRQ $0x2f, R14
|
|
SHLQ $0x20, R15
|
|
IMULQ R10, R15
|
|
SHRQ $0x32, R15
|
|
LEAQ 1(DI), R10
|
|
LEAQ 1(R11), BP
|
|
MOVL DI, (R8)(R12*4)
|
|
MOVL R11, (R8)(R14*4)
|
|
LEAQ 1(R11)(DI*1), R12
|
|
SHRQ $0x01, R12
|
|
ADDQ $0x01, DI
|
|
SUBQ $0x01, R11
|
|
MOVL R10, 524288(R8)(R13*4)
|
|
MOVL BP, 524288(R8)(R15*4)
|
|
|
|
index_loop_encodeBetterBlockAsm:
|
|
CMPQ R12, R11
|
|
JAE search_loop_encodeBetterBlockAsm
|
|
MOVQ (DX)(DI*1), R10
|
|
MOVQ (DX)(R12*1), R13
|
|
SHLQ $0x08, R10
|
|
IMULQ R9, R10
|
|
SHRQ $0x2f, R10
|
|
SHLQ $0x08, R13
|
|
IMULQ R9, R13
|
|
SHRQ $0x2f, R13
|
|
MOVL DI, (R8)(R10*4)
|
|
MOVL R11, (R8)(R13*4)
|
|
ADDQ $0x02, DI
|
|
ADDQ $0x02, R12
|
|
JMP index_loop_encodeBetterBlockAsm
|
|
|
|
emit_remainder_encodeBetterBlockAsm:
|
|
MOVQ src_len+32(FP), AX
|
|
SUBL 12(SP), AX
|
|
LEAQ 4(CX)(AX*1), AX
|
|
CMPQ AX, (SP)
|
|
JB emit_remainder_ok_encodeBetterBlockAsm
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
emit_remainder_ok_encodeBetterBlockAsm:
|
|
MOVQ src_len+32(FP), AX
|
|
|
|
// emitLiteralsDstP
|
|
MOVL 12(SP), BX
|
|
CMPL BX, AX
|
|
JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm
|
|
MOVL AX, SI
|
|
MOVL AX, 12(SP)
|
|
LEAQ (DX)(BX*1), AX
|
|
SUBL BX, SI
|
|
|
|
// emitLiteral
|
|
LEAL -1(SI), DX
|
|
CMPL DX, $0x1d
|
|
JB one_byte_emit_remainder_encodeBetterBlockAsm
|
|
SUBL $0x1d, DX
|
|
CMPL DX, $0x00000100
|
|
JB two_bytes_emit_remainder_encodeBetterBlockAsm
|
|
CMPL DX, $0x00010000
|
|
JB three_bytes_emit_remainder_encodeBetterBlockAsm
|
|
MOVL DX, BX
|
|
SHRL $0x10, BX
|
|
MOVB $0xf8, (CX)
|
|
MOVW DX, 1(CX)
|
|
MOVB BL, 3(CX)
|
|
ADDQ $0x04, CX
|
|
ADDL $0x1d, DX
|
|
JMP memmove_long_emit_remainder_encodeBetterBlockAsm
|
|
|
|
three_bytes_emit_remainder_encodeBetterBlockAsm:
|
|
MOVB $0xf0, (CX)
|
|
MOVW DX, 1(CX)
|
|
ADDQ $0x03, CX
|
|
ADDL $0x1d, DX
|
|
JMP memmove_long_emit_remainder_encodeBetterBlockAsm
|
|
|
|
two_bytes_emit_remainder_encodeBetterBlockAsm:
|
|
MOVB $0xe8, (CX)
|
|
MOVB DL, 1(CX)
|
|
ADDL $0x1d, DX
|
|
ADDQ $0x02, CX
|
|
CMPL DX, $0x40
|
|
JB memmove_midemit_remainder_encodeBetterBlockAsm
|
|
JMP memmove_long_emit_remainder_encodeBetterBlockAsm
|
|
|
|
one_byte_emit_remainder_encodeBetterBlockAsm:
|
|
SHLB $0x03, DL
|
|
MOVB DL, (CX)
|
|
ADDQ $0x01, CX
|
|
LEAQ (CX)(SI*1), DX
|
|
MOVL SI, BX
|
|
|
|
// genMemMoveShort
|
|
// margin: -1, min move: 1
|
|
CMPQ BX, $0x03
|
|
JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2
|
|
JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3
|
|
CMPQ BX, $0x08
|
|
JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through8
|
|
CMPQ BX, $0x10
|
|
JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16
|
|
CMPQ BX, $0x20
|
|
JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32
|
|
JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2:
|
|
MOVB (AX), SI
|
|
MOVB -1(AX)(BX*1), AL
|
|
MOVB SI, (CX)
|
|
MOVB AL, -1(CX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3:
|
|
MOVW (AX), SI
|
|
MOVB 2(AX), AL
|
|
MOVW SI, (CX)
|
|
MOVB AL, 2(CX)
|
|
JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through8:
|
|
MOVL (AX), SI
|
|
MOVL -4(AX)(BX*1), AX
|
|
MOVL SI, (CX)
|
|
MOVL AX, -4(CX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16:
|
|
MOVQ (AX), SI
|
|
MOVQ -8(AX)(BX*1), AX
|
|
MOVQ SI, (CX)
|
|
MOVQ AX, -8(CX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32:
|
|
MOVOU (AX), X0
|
|
MOVOU -16(AX)(BX*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64:
|
|
MOVOU (AX), X0
|
|
MOVOU 16(AX), X1
|
|
MOVOU -32(AX)(BX*1), X2
|
|
MOVOU -16(AX)(BX*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(BX*1)
|
|
MOVOU X3, -16(CX)(BX*1)
|
|
|
|
memmove_end_copy_emit_remainder_encodeBetterBlockAsm:
|
|
MOVQ DX, CX
|
|
JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm
|
|
|
|
memmove_midemit_remainder_encodeBetterBlockAsm:
|
|
LEAQ (CX)(SI*1), DX
|
|
MOVL SI, BX
|
|
|
|
// genMemMoveShort
|
|
// margin: -2, min move: 30
|
|
CMPQ BX, $0x20
|
|
JBE emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32
|
|
JMP emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64
|
|
|
|
emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32:
|
|
MOVOU (AX), X0
|
|
MOVOU -16(AX)(BX*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(BX*1)
|
|
JMP memmove_mid_end_copy_emit_remainder_encodeBetterBlockAsm
|
|
|
|
emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64:
|
|
MOVOU (AX), X0
|
|
MOVOU 16(AX), X1
|
|
MOVOU -32(AX)(BX*1), X2
|
|
MOVOU -16(AX)(BX*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(BX*1)
|
|
MOVOU X3, -16(CX)(BX*1)
|
|
|
|
memmove_mid_end_copy_emit_remainder_encodeBetterBlockAsm:
|
|
MOVQ DX, CX
|
|
JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm
|
|
|
|
memmove_long_emit_remainder_encodeBetterBlockAsm:
|
|
LEAQ (CX)(SI*1), DX
|
|
MOVL SI, BX
|
|
|
|
// genMemMoveLong
|
|
MOVOU (AX), X0
|
|
MOVOU 16(AX), X1
|
|
MOVOU -32(AX)(BX*1), X2
|
|
MOVOU -16(AX)(BX*1), X3
|
|
MOVQ BX, DI
|
|
SHRQ $0x05, DI
|
|
MOVQ CX, SI
|
|
ANDL $0x0000001f, SI
|
|
MOVQ $0x00000040, R8
|
|
SUBQ SI, R8
|
|
DECQ DI
|
|
JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32
|
|
LEAQ -32(AX)(R8*1), SI
|
|
LEAQ -32(CX)(R8*1), R9
|
|
|
|
emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back:
|
|
MOVOU (SI), X4
|
|
MOVOU 16(SI), X5
|
|
MOVOA X4, (R9)
|
|
MOVOA X5, 16(R9)
|
|
ADDQ $0x20, R9
|
|
ADDQ $0x20, SI
|
|
ADDQ $0x20, R8
|
|
DECQ DI
|
|
JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32:
|
|
MOVOU -32(AX)(R8*1), X4
|
|
MOVOU -16(AX)(R8*1), X5
|
|
MOVOA X4, -32(CX)(R8*1)
|
|
MOVOA X5, -16(CX)(R8*1)
|
|
ADDQ $0x20, R8
|
|
CMPQ BX, R8
|
|
JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(BX*1)
|
|
MOVOU X3, -16(CX)(BX*1)
|
|
MOVQ DX, CX
|
|
|
|
emit_literal_done_emit_remainder_encodeBetterBlockAsm:
|
|
MOVQ dst_base+0(FP), AX
|
|
SUBQ AX, CX
|
|
MOVQ CX, ret+56(FP)
|
|
RET
|
|
|
|
// func encodeBetterBlockAsm2MB(dst []byte, src []byte, tmp *[589824]byte) int
|
|
// Requires: BMI, CMOV, SSE2
|
|
TEXT ·encodeBetterBlockAsm2MB(SB), $24-64
|
|
MOVQ tmp+48(FP), AX
|
|
MOVQ dst_base+0(FP), CX
|
|
MOVQ $0x00001200, DX
|
|
PXOR X0, X0
|
|
|
|
zero_loop_encodeBetterBlockAsm2MB:
|
|
MOVOU X0, (AX)
|
|
MOVOU X0, 16(AX)
|
|
MOVOU X0, 32(AX)
|
|
MOVOU X0, 48(AX)
|
|
MOVOU X0, 64(AX)
|
|
MOVOU X0, 80(AX)
|
|
MOVOU X0, 96(AX)
|
|
MOVOU X0, 112(AX)
|
|
ADDQ $0x80, AX
|
|
DECQ DX
|
|
JNZ zero_loop_encodeBetterBlockAsm2MB
|
|
MOVL $0x00000000, 12(SP)
|
|
MOVQ src_len+32(FP), AX
|
|
LEAQ -17(AX), DX
|
|
LEAQ -17(AX), BX
|
|
MOVL BX, 8(SP)
|
|
SHRQ $0x05, AX
|
|
SUBL AX, DX
|
|
LEAQ (CX)(DX*1), DX
|
|
MOVQ DX, (SP)
|
|
MOVL $0x00000001, AX
|
|
MOVL AX, 16(SP)
|
|
MOVQ src_base+24(FP), DX
|
|
|
|
search_loop_encodeBetterBlockAsm2MB:
|
|
MOVQ tmp+48(FP), BX
|
|
MOVL AX, SI
|
|
SUBL 12(SP), SI
|
|
SHRL $0x07, SI
|
|
CMPL SI, $0x63
|
|
JBE check_maxskip_ok_encodeBetterBlockAsm2MB
|
|
LEAL 100(AX), SI
|
|
JMP check_maxskip_cont_encodeBetterBlockAsm2MB
|
|
|
|
check_maxskip_ok_encodeBetterBlockAsm2MB:
|
|
LEAL 1(AX)(SI*1), SI
|
|
|
|
check_maxskip_cont_encodeBetterBlockAsm2MB:
|
|
CMPL SI, 8(SP)
|
|
JAE emit_remainder_encodeBetterBlockAsm2MB
|
|
MOVQ (DX)(AX*1), DI
|
|
MOVL SI, 20(SP)
|
|
MOVQ $0x00cf1bbcdcbfa563, R9
|
|
MOVQ $0x9e3779b1, SI
|
|
MOVQ DI, R10
|
|
MOVQ DI, R11
|
|
SHLQ $0x08, R10
|
|
IMULQ R9, R10
|
|
SHRQ $0x2f, R10
|
|
SHLQ $0x20, R11
|
|
IMULQ SI, R11
|
|
SHRQ $0x32, R11
|
|
MOVL (BX)(R10*4), SI
|
|
MOVL 524288(BX)(R11*4), R8
|
|
MOVL AX, (BX)(R10*4)
|
|
MOVL AX, 524288(BX)(R11*4)
|
|
MOVQ (DX)(SI*1), R10
|
|
CMPQ R10, DI
|
|
JEQ candidate_match_encodeBetterBlockAsm2MB
|
|
MOVQ (DX)(R8*1), R11
|
|
CMPQ R11, DI
|
|
MOVL AX, R12
|
|
SUBL 16(SP), R12
|
|
MOVQ (DX)(R12*1), R12
|
|
MOVQ $0x000000ffffffff00, R13
|
|
XORQ DI, R12
|
|
TESTQ R13, R12
|
|
JNE no_repeat_found_encodeBetterBlockAsm2MB
|
|
LEAL 1(AX), BX
|
|
MOVL 12(SP), SI
|
|
MOVL BX, DI
|
|
SUBL 16(SP), DI
|
|
JZ repeat_extend_back_end_encodeBetterBlockAsm2MB
|
|
|
|
repeat_extend_back_loop_encodeBetterBlockAsm2MB:
|
|
CMPL BX, SI
|
|
JBE repeat_extend_back_end_encodeBetterBlockAsm2MB
|
|
MOVB -1(DX)(DI*1), R8
|
|
MOVB -1(DX)(BX*1), R9
|
|
CMPB R8, R9
|
|
JNE repeat_extend_back_end_encodeBetterBlockAsm2MB
|
|
LEAL -1(BX), BX
|
|
DECL DI
|
|
JNZ repeat_extend_back_loop_encodeBetterBlockAsm2MB
|
|
|
|
repeat_extend_back_end_encodeBetterBlockAsm2MB:
|
|
MOVL BX, SI
|
|
SUBL 12(SP), SI
|
|
LEAQ 4(CX)(SI*1), SI
|
|
CMPQ SI, (SP)
|
|
JB repeat_dst_size_check_encodeBetterBlockAsm2MB
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
repeat_dst_size_check_encodeBetterBlockAsm2MB:
|
|
// emitLiteralsDstP
|
|
MOVL 12(SP), SI
|
|
CMPL SI, BX
|
|
JEQ emit_literal_done_repeat_emit_encodeBetterBlockAsm2MB
|
|
MOVL BX, DI
|
|
MOVL BX, 12(SP)
|
|
LEAQ (DX)(SI*1), R8
|
|
SUBL SI, DI
|
|
|
|
// emitLiteral
|
|
LEAL -1(DI), SI
|
|
CMPL SI, $0x1d
|
|
JB one_byte_repeat_emit_encodeBetterBlockAsm2MB
|
|
SUBL $0x1d, SI
|
|
CMPL SI, $0x00000100
|
|
JB two_bytes_repeat_emit_encodeBetterBlockAsm2MB
|
|
CMPL SI, $0x00010000
|
|
JB three_bytes_repeat_emit_encodeBetterBlockAsm2MB
|
|
MOVL SI, R9
|
|
SHRL $0x10, R9
|
|
MOVB $0xf8, (CX)
|
|
MOVW SI, 1(CX)
|
|
MOVB R9, 3(CX)
|
|
ADDQ $0x04, CX
|
|
ADDL $0x1d, SI
|
|
JMP memmove_long_repeat_emit_encodeBetterBlockAsm2MB
|
|
|
|
three_bytes_repeat_emit_encodeBetterBlockAsm2MB:
|
|
MOVB $0xf0, (CX)
|
|
MOVW SI, 1(CX)
|
|
ADDQ $0x03, CX
|
|
ADDL $0x1d, SI
|
|
JMP memmove_long_repeat_emit_encodeBetterBlockAsm2MB
|
|
|
|
two_bytes_repeat_emit_encodeBetterBlockAsm2MB:
|
|
MOVB $0xe8, (CX)
|
|
MOVB SI, 1(CX)
|
|
ADDL $0x1d, SI
|
|
ADDQ $0x02, CX
|
|
CMPL SI, $0x40
|
|
JB memmove_midrepeat_emit_encodeBetterBlockAsm2MB
|
|
JMP memmove_long_repeat_emit_encodeBetterBlockAsm2MB
|
|
|
|
one_byte_repeat_emit_encodeBetterBlockAsm2MB:
|
|
SHLB $0x03, SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x01, CX
|
|
LEAQ (CX)(DI*1), SI
|
|
|
|
// genMemMoveShort
|
|
// margin: 16, min move: 1
|
|
CMPQ DI, $0x10
|
|
JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm2MB_memmove_move_8through16
|
|
CMPQ DI, $0x20
|
|
JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm2MB_memmove_move_17through32
|
|
JMP emit_lit_memmove_repeat_emit_encodeBetterBlockAsm2MB_memmove_move_33through64
|
|
|
|
emit_lit_memmove_repeat_emit_encodeBetterBlockAsm2MB_memmove_move_8through16:
|
|
MOVOU (R8), X0
|
|
MOVOU X0, (CX)
|
|
JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm2MB
|
|
|
|
emit_lit_memmove_repeat_emit_encodeBetterBlockAsm2MB_memmove_move_17through32:
|
|
MOVOU (R8), X0
|
|
MOVOU -16(R8)(DI*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(DI*1)
|
|
JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm2MB
|
|
|
|
emit_lit_memmove_repeat_emit_encodeBetterBlockAsm2MB_memmove_move_33through64:
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(DI*1), X2
|
|
MOVOU -16(R8)(DI*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(DI*1)
|
|
MOVOU X3, -16(CX)(DI*1)
|
|
|
|
memmove_end_copy_repeat_emit_encodeBetterBlockAsm2MB:
|
|
MOVQ SI, CX
|
|
JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm2MB
|
|
|
|
memmove_midrepeat_emit_encodeBetterBlockAsm2MB:
|
|
LEAQ (CX)(DI*1), SI
|
|
|
|
// genMemMoveShort
|
|
// margin: 15, min move: 30
|
|
CMPQ DI, $0x20
|
|
JBE emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm2MB_memmove_move_17through32
|
|
JMP emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm2MB_memmove_move_33through64
|
|
|
|
emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm2MB_memmove_move_17through32:
|
|
MOVOU (R8), X0
|
|
MOVOU -16(R8)(DI*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(DI*1)
|
|
JMP memmove_mid_end_copy_repeat_emit_encodeBetterBlockAsm2MB
|
|
|
|
emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm2MB_memmove_move_33through64:
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(DI*1), X2
|
|
MOVOU -16(R8)(DI*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(DI*1)
|
|
MOVOU X3, -16(CX)(DI*1)
|
|
|
|
memmove_mid_end_copy_repeat_emit_encodeBetterBlockAsm2MB:
|
|
MOVQ SI, CX
|
|
JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm2MB
|
|
|
|
memmove_long_repeat_emit_encodeBetterBlockAsm2MB:
|
|
LEAQ (CX)(DI*1), SI
|
|
|
|
// genMemMoveLong
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(DI*1), X2
|
|
MOVOU -16(R8)(DI*1), X3
|
|
MOVQ DI, R10
|
|
SHRQ $0x05, R10
|
|
MOVQ CX, R9
|
|
ANDL $0x0000001f, R9
|
|
MOVQ $0x00000040, R11
|
|
SUBQ R9, R11
|
|
DECQ R10
|
|
JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm2MBlarge_forward_sse_loop_32
|
|
LEAQ -32(R8)(R11*1), R9
|
|
LEAQ -32(CX)(R11*1), R12
|
|
|
|
emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm2MBlarge_big_loop_back:
|
|
MOVOU (R9), X4
|
|
MOVOU 16(R9), X5
|
|
MOVOA X4, (R12)
|
|
MOVOA X5, 16(R12)
|
|
ADDQ $0x20, R12
|
|
ADDQ $0x20, R9
|
|
ADDQ $0x20, R11
|
|
DECQ R10
|
|
JNA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm2MBlarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm2MBlarge_forward_sse_loop_32:
|
|
MOVOU -32(R8)(R11*1), X4
|
|
MOVOU -16(R8)(R11*1), X5
|
|
MOVOA X4, -32(CX)(R11*1)
|
|
MOVOA X5, -16(CX)(R11*1)
|
|
ADDQ $0x20, R11
|
|
CMPQ DI, R11
|
|
JAE emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm2MBlarge_forward_sse_loop_32
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(DI*1)
|
|
MOVOU X3, -16(CX)(DI*1)
|
|
MOVQ SI, CX
|
|
|
|
emit_literal_done_repeat_emit_encodeBetterBlockAsm2MB:
|
|
ADDL $0x05, AX
|
|
MOVL AX, SI
|
|
SUBL 16(SP), SI
|
|
MOVQ src_len+32(FP), DI
|
|
SUBL AX, DI
|
|
LEAQ (DX)(AX*1), R8
|
|
LEAQ (DX)(SI*1), SI
|
|
|
|
// matchLen
|
|
XORL R10, R10
|
|
JMP matchlen_loop_16_entry_repeat_extend_encodeBetterBlockAsm2MB
|
|
|
|
matchlen_loopback_16_repeat_extend_encodeBetterBlockAsm2MB:
|
|
MOVQ (R8)(R10*1), R9
|
|
MOVQ 8(R8)(R10*1), R11
|
|
XORQ (SI)(R10*1), R9
|
|
JNZ matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm2MB
|
|
XORQ 8(SI)(R10*1), R11
|
|
JNZ matchlen_bsf_16repeat_extend_encodeBetterBlockAsm2MB
|
|
LEAL -16(DI), DI
|
|
LEAL 16(R10), R10
|
|
|
|
matchlen_loop_16_entry_repeat_extend_encodeBetterBlockAsm2MB:
|
|
CMPL DI, $0x10
|
|
JAE matchlen_loopback_16_repeat_extend_encodeBetterBlockAsm2MB
|
|
JMP matchlen_match8_repeat_extend_encodeBetterBlockAsm2MB
|
|
|
|
matchlen_bsf_16repeat_extend_encodeBetterBlockAsm2MB:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R11, R11
|
|
|
|
#else
|
|
BSFQ R11, R11
|
|
|
|
#endif
|
|
SARQ $0x03, R11
|
|
LEAL 8(R10)(R11*1), R10
|
|
JMP repeat_extend_forward_end_encodeBetterBlockAsm2MB
|
|
|
|
matchlen_match8_repeat_extend_encodeBetterBlockAsm2MB:
|
|
CMPL DI, $0x08
|
|
JB matchlen_match4_repeat_extend_encodeBetterBlockAsm2MB
|
|
MOVQ (R8)(R10*1), R9
|
|
XORQ (SI)(R10*1), R9
|
|
JNZ matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm2MB
|
|
LEAL -8(DI), DI
|
|
LEAL 8(R10), R10
|
|
JMP matchlen_match4_repeat_extend_encodeBetterBlockAsm2MB
|
|
|
|
matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm2MB:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R9, R9
|
|
|
|
#else
|
|
BSFQ R9, R9
|
|
|
|
#endif
|
|
SARQ $0x03, R9
|
|
LEAL (R10)(R9*1), R10
|
|
JMP repeat_extend_forward_end_encodeBetterBlockAsm2MB
|
|
|
|
matchlen_match4_repeat_extend_encodeBetterBlockAsm2MB:
|
|
CMPL DI, $0x04
|
|
JB matchlen_match2_repeat_extend_encodeBetterBlockAsm2MB
|
|
MOVL (R8)(R10*1), R9
|
|
CMPL (SI)(R10*1), R9
|
|
JNE matchlen_match2_repeat_extend_encodeBetterBlockAsm2MB
|
|
LEAL -4(DI), DI
|
|
LEAL 4(R10), R10
|
|
|
|
matchlen_match2_repeat_extend_encodeBetterBlockAsm2MB:
|
|
CMPL DI, $0x01
|
|
JE matchlen_match1_repeat_extend_encodeBetterBlockAsm2MB
|
|
JB repeat_extend_forward_end_encodeBetterBlockAsm2MB
|
|
MOVW (R8)(R10*1), R9
|
|
CMPW (SI)(R10*1), R9
|
|
JNE matchlen_match1_repeat_extend_encodeBetterBlockAsm2MB
|
|
LEAL 2(R10), R10
|
|
SUBL $0x02, DI
|
|
JZ repeat_extend_forward_end_encodeBetterBlockAsm2MB
|
|
|
|
matchlen_match1_repeat_extend_encodeBetterBlockAsm2MB:
|
|
MOVB (R8)(R10*1), R9
|
|
CMPB (SI)(R10*1), R9
|
|
JNE repeat_extend_forward_end_encodeBetterBlockAsm2MB
|
|
LEAL 1(R10), R10
|
|
|
|
repeat_extend_forward_end_encodeBetterBlockAsm2MB:
|
|
ADDL R10, AX
|
|
MOVL AX, SI
|
|
SUBL BX, SI
|
|
MOVL 16(SP), BX
|
|
|
|
// emitRepeat
|
|
LEAL -1(SI), BX
|
|
CMPL SI, $0x1d
|
|
JBE repeat_one_match_repeat_encodeBetterBlockAsm2MB
|
|
LEAL -30(SI), BX
|
|
CMPL SI, $0x0000011e
|
|
JB repeat_two_match_repeat_encodeBetterBlockAsm2MB
|
|
CMPL SI, $0x0001001e
|
|
JB repeat_three_match_repeat_encodeBetterBlockAsm2MB
|
|
MOVB $0xfc, (CX)
|
|
MOVL BX, 1(CX)
|
|
ADDQ $0x04, CX
|
|
JMP repeat_end_emit_encodeBetterBlockAsm2MB
|
|
|
|
repeat_three_match_repeat_encodeBetterBlockAsm2MB:
|
|
MOVB $0xf4, (CX)
|
|
MOVW BX, 1(CX)
|
|
ADDQ $0x03, CX
|
|
JMP repeat_end_emit_encodeBetterBlockAsm2MB
|
|
|
|
repeat_two_match_repeat_encodeBetterBlockAsm2MB:
|
|
MOVB $0xec, (CX)
|
|
MOVB BL, 1(CX)
|
|
ADDQ $0x02, CX
|
|
JMP repeat_end_emit_encodeBetterBlockAsm2MB
|
|
|
|
repeat_one_match_repeat_encodeBetterBlockAsm2MB:
|
|
XORL BX, BX
|
|
LEAL -4(BX)(SI*8), BX
|
|
MOVB BL, (CX)
|
|
ADDQ $0x01, CX
|
|
|
|
repeat_end_emit_encodeBetterBlockAsm2MB:
|
|
MOVL AX, 12(SP)
|
|
JMP search_loop_encodeBetterBlockAsm2MB
|
|
|
|
no_repeat_found_encodeBetterBlockAsm2MB:
|
|
CMPL R10, DI
|
|
JEQ candidate_match_encodeBetterBlockAsm2MB
|
|
CMPL R11, DI
|
|
JEQ candidateS_match_encodeBetterBlockAsm2MB
|
|
MOVL 20(SP), AX
|
|
JMP search_loop_encodeBetterBlockAsm2MB
|
|
|
|
candidateS_match_encodeBetterBlockAsm2MB:
|
|
SHRQ $0x08, DI
|
|
MOVQ DI, R10
|
|
SHLQ $0x08, R10
|
|
IMULQ R9, R10
|
|
SHRQ $0x2f, R10
|
|
MOVL (BX)(R10*4), SI
|
|
INCL AX
|
|
MOVL AX, (BX)(R10*4)
|
|
CMPL (DX)(SI*1), DI
|
|
JEQ candidate_match_encodeBetterBlockAsm2MB
|
|
DECL AX
|
|
MOVL R8, SI
|
|
|
|
candidate_match_encodeBetterBlockAsm2MB:
|
|
MOVL 12(SP), BX
|
|
TESTL SI, SI
|
|
JZ match_extend_back_end_encodeBetterBlockAsm2MB
|
|
|
|
match_extend_back_loop_encodeBetterBlockAsm2MB:
|
|
CMPL AX, BX
|
|
JBE match_extend_back_end_encodeBetterBlockAsm2MB
|
|
MOVB -1(DX)(SI*1), DI
|
|
MOVB -1(DX)(AX*1), R8
|
|
CMPB DI, R8
|
|
JNE match_extend_back_end_encodeBetterBlockAsm2MB
|
|
LEAL -1(AX), AX
|
|
DECL SI
|
|
JZ match_extend_back_end_encodeBetterBlockAsm2MB
|
|
JMP match_extend_back_loop_encodeBetterBlockAsm2MB
|
|
|
|
match_extend_back_end_encodeBetterBlockAsm2MB:
|
|
MOVL AX, BX
|
|
SUBL 12(SP), BX
|
|
LEAQ 4(CX)(BX*1), BX
|
|
CMPQ BX, (SP)
|
|
JB match_dst_size_check_encodeBetterBlockAsm2MB
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
match_dst_size_check_encodeBetterBlockAsm2MB:
|
|
MOVL AX, BX
|
|
ADDL $0x04, AX
|
|
ADDL $0x04, SI
|
|
MOVQ src_len+32(FP), DI
|
|
SUBL AX, DI
|
|
LEAQ (DX)(AX*1), R8
|
|
LEAQ (DX)(SI*1), R9
|
|
|
|
// matchLen
|
|
XORL R11, R11
|
|
JMP matchlen_loop_16_entry_match_nolit_encodeBetterBlockAsm2MB
|
|
|
|
matchlen_loopback_16_match_nolit_encodeBetterBlockAsm2MB:
|
|
MOVQ (R8)(R11*1), R10
|
|
MOVQ 8(R8)(R11*1), R12
|
|
XORQ (R9)(R11*1), R10
|
|
JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm2MB
|
|
XORQ 8(R9)(R11*1), R12
|
|
JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm2MB
|
|
LEAL -16(DI), DI
|
|
LEAL 16(R11), R11
|
|
|
|
matchlen_loop_16_entry_match_nolit_encodeBetterBlockAsm2MB:
|
|
CMPL DI, $0x10
|
|
JAE matchlen_loopback_16_match_nolit_encodeBetterBlockAsm2MB
|
|
JMP matchlen_match8_match_nolit_encodeBetterBlockAsm2MB
|
|
|
|
matchlen_bsf_16match_nolit_encodeBetterBlockAsm2MB:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R12, R12
|
|
|
|
#else
|
|
BSFQ R12, R12
|
|
|
|
#endif
|
|
SARQ $0x03, R12
|
|
LEAL 8(R11)(R12*1), R11
|
|
JMP match_nolit_end_encodeBetterBlockAsm2MB
|
|
|
|
matchlen_match8_match_nolit_encodeBetterBlockAsm2MB:
|
|
CMPL DI, $0x08
|
|
JB matchlen_match4_match_nolit_encodeBetterBlockAsm2MB
|
|
MOVQ (R8)(R11*1), R10
|
|
XORQ (R9)(R11*1), R10
|
|
JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm2MB
|
|
LEAL -8(DI), DI
|
|
LEAL 8(R11), R11
|
|
JMP matchlen_match4_match_nolit_encodeBetterBlockAsm2MB
|
|
|
|
matchlen_bsf_8_match_nolit_encodeBetterBlockAsm2MB:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R10, R10
|
|
|
|
#else
|
|
BSFQ R10, R10
|
|
|
|
#endif
|
|
SARQ $0x03, R10
|
|
LEAL (R11)(R10*1), R11
|
|
JMP match_nolit_end_encodeBetterBlockAsm2MB
|
|
|
|
matchlen_match4_match_nolit_encodeBetterBlockAsm2MB:
|
|
CMPL DI, $0x04
|
|
JB matchlen_match2_match_nolit_encodeBetterBlockAsm2MB
|
|
MOVL (R8)(R11*1), R10
|
|
CMPL (R9)(R11*1), R10
|
|
JNE matchlen_match2_match_nolit_encodeBetterBlockAsm2MB
|
|
LEAL -4(DI), DI
|
|
LEAL 4(R11), R11
|
|
|
|
matchlen_match2_match_nolit_encodeBetterBlockAsm2MB:
|
|
CMPL DI, $0x01
|
|
JE matchlen_match1_match_nolit_encodeBetterBlockAsm2MB
|
|
JB match_nolit_end_encodeBetterBlockAsm2MB
|
|
MOVW (R8)(R11*1), R10
|
|
CMPW (R9)(R11*1), R10
|
|
JNE matchlen_match1_match_nolit_encodeBetterBlockAsm2MB
|
|
LEAL 2(R11), R11
|
|
SUBL $0x02, DI
|
|
JZ match_nolit_end_encodeBetterBlockAsm2MB
|
|
|
|
matchlen_match1_match_nolit_encodeBetterBlockAsm2MB:
|
|
MOVB (R8)(R11*1), R10
|
|
CMPB (R9)(R11*1), R10
|
|
JNE match_nolit_end_encodeBetterBlockAsm2MB
|
|
LEAL 1(R11), R11
|
|
|
|
match_nolit_end_encodeBetterBlockAsm2MB:
|
|
MOVL AX, DI
|
|
SUBL SI, DI
|
|
CMPL R11, $0x01
|
|
JA match_length_ok_encodeBetterBlockAsm2MB
|
|
CMPL DI, $0x0001003f
|
|
JBE match_length_ok_encodeBetterBlockAsm2MB
|
|
MOVL 20(SP), AX
|
|
INCL AX
|
|
JMP search_loop_encodeBetterBlockAsm2MB
|
|
|
|
match_length_ok_encodeBetterBlockAsm2MB:
|
|
MOVL DI, 16(SP)
|
|
|
|
// Check if we can combine lit+copy
|
|
MOVLQZX 12(SP), R8
|
|
MOVL BX, SI
|
|
SUBL R8, SI
|
|
JZ match_emit_nolits_encodeBetterBlockAsm2MB
|
|
CMPL DI, $0x00000040
|
|
JL match_emit_lits_encodeBetterBlockAsm2MB
|
|
CMPL DI, $0x0001003f
|
|
JA match_emit_copy3_encodeBetterBlockAsm2MB
|
|
CMPL SI, $0x04
|
|
JA match_emit_lits_encodeBetterBlockAsm2MB
|
|
MOVL (DX)(R8*1), R8
|
|
ADDL R11, AX
|
|
ADDL $0x04, R11
|
|
MOVL AX, 12(SP)
|
|
|
|
// emitCopy2WithLits
|
|
XORQ R9, R9
|
|
SUBL $0x40, DI
|
|
LEAL -11(R11), R10
|
|
LEAL -4(R11), R11
|
|
MOVW DI, 1(CX)
|
|
CMPL R11, $0x07
|
|
CMOVLGE R10, R9
|
|
MOVQ $0x00000007, DI
|
|
CMOVLLT R11, DI
|
|
LEAL -1(SI)(DI*4), DI
|
|
MOVL $0x00000003, R10
|
|
LEAL (R10)(DI*8), DI
|
|
MOVB DI, (CX)
|
|
ADDQ $0x03, CX
|
|
MOVL R8, (CX)
|
|
ADDQ SI, CX
|
|
TESTL R9, R9
|
|
JZ match_nolit_emitcopy_end_encodeBetterBlockAsm2MB
|
|
|
|
// emitRepeat
|
|
LEAL -1(R9), SI
|
|
CMPL R9, $0x1d
|
|
JBE repeat_one_match_emit_repeat_copy2_encodeBetterBlockAsm2MB
|
|
LEAL -30(R9), SI
|
|
CMPL R9, $0x0000011e
|
|
JB repeat_two_match_emit_repeat_copy2_encodeBetterBlockAsm2MB
|
|
CMPL R9, $0x0001001e
|
|
JB repeat_three_match_emit_repeat_copy2_encodeBetterBlockAsm2MB
|
|
MOVB $0xfc, (CX)
|
|
MOVL SI, 1(CX)
|
|
ADDQ $0x04, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB
|
|
|
|
repeat_three_match_emit_repeat_copy2_encodeBetterBlockAsm2MB:
|
|
MOVB $0xf4, (CX)
|
|
MOVW SI, 1(CX)
|
|
ADDQ $0x03, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB
|
|
|
|
repeat_two_match_emit_repeat_copy2_encodeBetterBlockAsm2MB:
|
|
MOVB $0xec, (CX)
|
|
MOVB SI, 1(CX)
|
|
ADDQ $0x02, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB
|
|
|
|
repeat_one_match_emit_repeat_copy2_encodeBetterBlockAsm2MB:
|
|
XORL SI, SI
|
|
LEAL -4(SI)(R9*8), SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x01, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB
|
|
|
|
match_emit_copy3_encodeBetterBlockAsm2MB:
|
|
CMPL SI, $0x03
|
|
JA match_emit_lits_encodeBetterBlockAsm2MB
|
|
MOVLQZX 12(SP), R8
|
|
MOVL (DX)(R8*1), R8
|
|
ADDL R11, AX
|
|
ADDL $0x04, R11
|
|
MOVL AX, 12(SP)
|
|
|
|
// emitCopy3
|
|
LEAL -4(R11), R11
|
|
LEAL -65536(DI), DI
|
|
SHLL $0x0b, DI
|
|
LEAL 7(DI)(SI*8), DI
|
|
CMPL R11, $0x3c
|
|
JBE emit_copy3_0_match_emit_lits_encodeBetterBlockAsm2MB
|
|
LEAL -60(R11), R9
|
|
CMPL R11, $0x0000013c
|
|
JB emit_copy3_1_match_emit_lits_encodeBetterBlockAsm2MB
|
|
CMPL R11, $0x0001003c
|
|
JB emit_copy3_2_match_emit_lits_encodeBetterBlockAsm2MB
|
|
ADDL $0x000007e0, DI
|
|
MOVL DI, (CX)
|
|
MOVL R9, 4(CX)
|
|
ADDQ $0x07, CX
|
|
JMP match_emit_copy_litsencodeBetterBlockAsm2MB
|
|
|
|
emit_copy3_2_match_emit_lits_encodeBetterBlockAsm2MB:
|
|
ADDL $0x000007c0, DI
|
|
MOVL DI, (CX)
|
|
MOVW R9, 4(CX)
|
|
ADDQ $0x06, CX
|
|
JMP match_emit_copy_litsencodeBetterBlockAsm2MB
|
|
|
|
emit_copy3_1_match_emit_lits_encodeBetterBlockAsm2MB:
|
|
ADDL $0x000007a0, DI
|
|
MOVL DI, (CX)
|
|
MOVB R9, 4(CX)
|
|
ADDQ $0x05, CX
|
|
JMP match_emit_copy_litsencodeBetterBlockAsm2MB
|
|
|
|
emit_copy3_0_match_emit_lits_encodeBetterBlockAsm2MB:
|
|
SHLL $0x05, R11
|
|
ORL R11, DI
|
|
MOVL DI, (CX)
|
|
ADDQ $0x04, CX
|
|
|
|
match_emit_copy_litsencodeBetterBlockAsm2MB:
|
|
MOVL R8, (CX)
|
|
ADDQ SI, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB
|
|
|
|
match_emit_lits_encodeBetterBlockAsm2MB:
|
|
LEAQ (DX)(R8*1), R8
|
|
|
|
// emitLiteral
|
|
LEAL -1(SI), R9
|
|
CMPL R9, $0x1d
|
|
JB one_byte_match_emit_encodeBetterBlockAsm2MB
|
|
SUBL $0x1d, R9
|
|
CMPL R9, $0x00000100
|
|
JB two_bytes_match_emit_encodeBetterBlockAsm2MB
|
|
CMPL R9, $0x00010000
|
|
JB three_bytes_match_emit_encodeBetterBlockAsm2MB
|
|
MOVL R9, R10
|
|
SHRL $0x10, R10
|
|
MOVB $0xf8, (CX)
|
|
MOVW R9, 1(CX)
|
|
MOVB R10, 3(CX)
|
|
ADDQ $0x04, CX
|
|
ADDL $0x1d, R9
|
|
JMP memmove_long_match_emit_encodeBetterBlockAsm2MB
|
|
|
|
three_bytes_match_emit_encodeBetterBlockAsm2MB:
|
|
MOVB $0xf0, (CX)
|
|
MOVW R9, 1(CX)
|
|
ADDQ $0x03, CX
|
|
ADDL $0x1d, R9
|
|
JMP memmove_long_match_emit_encodeBetterBlockAsm2MB
|
|
|
|
two_bytes_match_emit_encodeBetterBlockAsm2MB:
|
|
MOVB $0xe8, (CX)
|
|
MOVB R9, 1(CX)
|
|
ADDL $0x1d, R9
|
|
ADDQ $0x02, CX
|
|
CMPL R9, $0x40
|
|
JB memmove_midmatch_emit_encodeBetterBlockAsm2MB
|
|
JMP memmove_long_match_emit_encodeBetterBlockAsm2MB
|
|
|
|
one_byte_match_emit_encodeBetterBlockAsm2MB:
|
|
SHLB $0x03, R9
|
|
MOVB R9, (CX)
|
|
ADDQ $0x01, CX
|
|
LEAQ (CX)(SI*1), R9
|
|
|
|
// genMemMoveShort
|
|
// margin: 16, min move: 1
|
|
CMPQ SI, $0x10
|
|
JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm2MB_memmove_move_8through16
|
|
CMPQ SI, $0x20
|
|
JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm2MB_memmove_move_17through32
|
|
JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm2MB_memmove_move_33through64
|
|
|
|
emit_lit_memmove_match_emit_encodeBetterBlockAsm2MB_memmove_move_8through16:
|
|
MOVOU (R8), X0
|
|
MOVOU X0, (CX)
|
|
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm2MB
|
|
|
|
emit_lit_memmove_match_emit_encodeBetterBlockAsm2MB_memmove_move_17through32:
|
|
MOVOU (R8), X0
|
|
MOVOU -16(R8)(SI*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(SI*1)
|
|
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm2MB
|
|
|
|
emit_lit_memmove_match_emit_encodeBetterBlockAsm2MB_memmove_move_33through64:
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(SI*1), X2
|
|
MOVOU -16(R8)(SI*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(SI*1)
|
|
MOVOU X3, -16(CX)(SI*1)
|
|
|
|
memmove_end_copy_match_emit_encodeBetterBlockAsm2MB:
|
|
MOVQ R9, CX
|
|
JMP match_emit_nolits_encodeBetterBlockAsm2MB
|
|
|
|
memmove_midmatch_emit_encodeBetterBlockAsm2MB:
|
|
LEAQ (CX)(SI*1), R9
|
|
|
|
// genMemMoveShort
|
|
// margin: 15, min move: 30
|
|
CMPQ SI, $0x20
|
|
JBE emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm2MB_memmove_move_17through32
|
|
JMP emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm2MB_memmove_move_33through64
|
|
|
|
emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm2MB_memmove_move_17through32:
|
|
MOVOU (R8), X0
|
|
MOVOU -16(R8)(SI*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(SI*1)
|
|
JMP memmove_mid_end_copy_match_emit_encodeBetterBlockAsm2MB
|
|
|
|
emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm2MB_memmove_move_33through64:
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(SI*1), X2
|
|
MOVOU -16(R8)(SI*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(SI*1)
|
|
MOVOU X3, -16(CX)(SI*1)
|
|
|
|
memmove_mid_end_copy_match_emit_encodeBetterBlockAsm2MB:
|
|
MOVQ R9, CX
|
|
JMP match_emit_nolits_encodeBetterBlockAsm2MB
|
|
|
|
memmove_long_match_emit_encodeBetterBlockAsm2MB:
|
|
LEAQ (CX)(SI*1), R9
|
|
|
|
// genMemMoveLong
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(SI*1), X2
|
|
MOVOU -16(R8)(SI*1), X3
|
|
MOVQ SI, R12
|
|
SHRQ $0x05, R12
|
|
MOVQ CX, R10
|
|
ANDL $0x0000001f, R10
|
|
MOVQ $0x00000040, R13
|
|
SUBQ R10, R13
|
|
DECQ R12
|
|
JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm2MBlarge_forward_sse_loop_32
|
|
LEAQ -32(R8)(R13*1), R10
|
|
LEAQ -32(CX)(R13*1), R14
|
|
|
|
emit_lit_memmove_long_match_emit_encodeBetterBlockAsm2MBlarge_big_loop_back:
|
|
MOVOU (R10), X4
|
|
MOVOU 16(R10), X5
|
|
MOVOA X4, (R14)
|
|
MOVOA X5, 16(R14)
|
|
ADDQ $0x20, R14
|
|
ADDQ $0x20, R10
|
|
ADDQ $0x20, R13
|
|
DECQ R12
|
|
JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm2MBlarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_match_emit_encodeBetterBlockAsm2MBlarge_forward_sse_loop_32:
|
|
MOVOU -32(R8)(R13*1), X4
|
|
MOVOU -16(R8)(R13*1), X5
|
|
MOVOA X4, -32(CX)(R13*1)
|
|
MOVOA X5, -16(CX)(R13*1)
|
|
ADDQ $0x20, R13
|
|
CMPQ SI, R13
|
|
JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm2MBlarge_forward_sse_loop_32
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(SI*1)
|
|
MOVOU X3, -16(CX)(SI*1)
|
|
MOVQ R9, CX
|
|
|
|
match_emit_nolits_encodeBetterBlockAsm2MB:
|
|
ADDL R11, AX
|
|
ADDL $0x04, R11
|
|
MOVL AX, 12(SP)
|
|
|
|
// emitCopy
|
|
CMPL DI, $0x0001003f
|
|
JBE two_byte_offset_match_nolit_encodeBetterBlockAsm2MB
|
|
|
|
// emitCopy3
|
|
LEAL -4(R11), R11
|
|
LEAL -65536(DI), SI
|
|
SHLL $0x0b, SI
|
|
ADDL $0x07, SI
|
|
CMPL R11, $0x3c
|
|
JBE emit_copy3_0_match_nolit_encodeBetterBlockAsm2MB_emit3
|
|
LEAL -60(R11), DI
|
|
CMPL R11, $0x0000013c
|
|
JB emit_copy3_1_match_nolit_encodeBetterBlockAsm2MB_emit3
|
|
CMPL R11, $0x0001003c
|
|
JB emit_copy3_2_match_nolit_encodeBetterBlockAsm2MB_emit3
|
|
ADDL $0x000007e0, SI
|
|
MOVL SI, (CX)
|
|
MOVL DI, 4(CX)
|
|
ADDQ $0x07, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB
|
|
|
|
emit_copy3_2_match_nolit_encodeBetterBlockAsm2MB_emit3:
|
|
ADDL $0x000007c0, SI
|
|
MOVL SI, (CX)
|
|
MOVW DI, 4(CX)
|
|
ADDQ $0x06, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB
|
|
|
|
emit_copy3_1_match_nolit_encodeBetterBlockAsm2MB_emit3:
|
|
ADDL $0x000007a0, SI
|
|
MOVL SI, (CX)
|
|
MOVB DI, 4(CX)
|
|
ADDQ $0x05, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB
|
|
|
|
emit_copy3_0_match_nolit_encodeBetterBlockAsm2MB_emit3:
|
|
SHLL $0x05, R11
|
|
ORL R11, SI
|
|
MOVL SI, (CX)
|
|
ADDQ $0x04, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB
|
|
|
|
two_byte_offset_match_nolit_encodeBetterBlockAsm2MB:
|
|
CMPL DI, $0x00000400
|
|
JA two_byte_match_nolit_encodeBetterBlockAsm2MB
|
|
CMPL R11, $0x00000013
|
|
JAE emit_one_longer_match_nolit_encodeBetterBlockAsm2MB
|
|
LEAL -1(DI), SI
|
|
SHLL $0x06, SI
|
|
LEAL -15(SI)(R11*4), SI
|
|
MOVW SI, (CX)
|
|
ADDQ $0x02, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB
|
|
|
|
emit_one_longer_match_nolit_encodeBetterBlockAsm2MB:
|
|
CMPL R11, $0x00000112
|
|
JAE emit_copy1_repeat_match_nolit_encodeBetterBlockAsm2MB
|
|
LEAL -1(DI), SI
|
|
SHLL $0x06, SI
|
|
LEAL 61(SI), SI
|
|
MOVW SI, (CX)
|
|
LEAL -18(R11), SI
|
|
MOVB SI, 2(CX)
|
|
ADDQ $0x03, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB
|
|
|
|
emit_copy1_repeat_match_nolit_encodeBetterBlockAsm2MB:
|
|
LEAL -1(DI), SI
|
|
SHLL $0x06, SI
|
|
LEAL 57(SI), SI
|
|
MOVW SI, (CX)
|
|
ADDQ $0x02, CX
|
|
SUBL $0x12, R11
|
|
|
|
// emitRepeat
|
|
LEAL -1(R11), SI
|
|
CMPL R11, $0x1d
|
|
JBE repeat_one_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm2MB
|
|
LEAL -30(R11), SI
|
|
CMPL R11, $0x0000011e
|
|
JB repeat_two_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm2MB
|
|
CMPL R11, $0x0001001e
|
|
JB repeat_three_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm2MB
|
|
MOVB $0xfc, (CX)
|
|
MOVL SI, 1(CX)
|
|
ADDQ $0x04, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB
|
|
|
|
repeat_three_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm2MB:
|
|
MOVB $0xf4, (CX)
|
|
MOVW SI, 1(CX)
|
|
ADDQ $0x03, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB
|
|
|
|
repeat_two_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm2MB:
|
|
MOVB $0xec, (CX)
|
|
MOVB SI, 1(CX)
|
|
ADDQ $0x02, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB
|
|
|
|
repeat_one_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm2MB:
|
|
XORL SI, SI
|
|
LEAL -4(SI)(R11*8), SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x01, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB
|
|
|
|
two_byte_match_nolit_encodeBetterBlockAsm2MB:
|
|
// emitCopy2
|
|
LEAL -64(DI), DI
|
|
LEAL -4(R11), R11
|
|
MOVW DI, 1(CX)
|
|
CMPL R11, $0x3c
|
|
JBE emit_copy2_0_match_nolit_encodeBetterBlockAsm2MB_emit2
|
|
LEAL -60(R11), SI
|
|
CMPL R11, $0x0000013c
|
|
JB emit_copy2_1_match_nolit_encodeBetterBlockAsm2MB_emit2
|
|
CMPL R11, $0x0001003c
|
|
JB emit_copy2_2_match_nolit_encodeBetterBlockAsm2MB_emit2
|
|
MOVB $0xfe, (CX)
|
|
MOVL SI, 3(CX)
|
|
ADDQ $0x06, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB
|
|
|
|
emit_copy2_2_match_nolit_encodeBetterBlockAsm2MB_emit2:
|
|
MOVB $0xfa, (CX)
|
|
MOVW SI, 3(CX)
|
|
ADDQ $0x05, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB
|
|
|
|
emit_copy2_1_match_nolit_encodeBetterBlockAsm2MB_emit2:
|
|
MOVB $0xf6, (CX)
|
|
MOVB SI, 3(CX)
|
|
ADDQ $0x04, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB
|
|
|
|
emit_copy2_0_match_nolit_encodeBetterBlockAsm2MB_emit2:
|
|
MOVL $0x00000002, SI
|
|
LEAL (SI)(R11*4), SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x03, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB
|
|
|
|
// emitLiteralsDstP
|
|
MOVL 12(SP), SI
|
|
CMPL SI, BX
|
|
JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm2MB
|
|
MOVL BX, DI
|
|
MOVL BX, 12(SP)
|
|
LEAQ (DX)(SI*1), R8
|
|
SUBL SI, DI
|
|
|
|
// emitLiteral
|
|
LEAL -1(DI), SI
|
|
CMPL SI, $0x1d
|
|
JB one_byte_match_emit_repeat_encodeBetterBlockAsm2MB
|
|
SUBL $0x1d, SI
|
|
CMPL SI, $0x00000100
|
|
JB two_bytes_match_emit_repeat_encodeBetterBlockAsm2MB
|
|
CMPL SI, $0x00010000
|
|
JB three_bytes_match_emit_repeat_encodeBetterBlockAsm2MB
|
|
MOVL SI, R9
|
|
SHRL $0x10, R9
|
|
MOVB $0xf8, (CX)
|
|
MOVW SI, 1(CX)
|
|
MOVB R9, 3(CX)
|
|
ADDQ $0x04, CX
|
|
ADDL $0x1d, SI
|
|
JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm2MB
|
|
|
|
three_bytes_match_emit_repeat_encodeBetterBlockAsm2MB:
|
|
MOVB $0xf0, (CX)
|
|
MOVW SI, 1(CX)
|
|
ADDQ $0x03, CX
|
|
ADDL $0x1d, SI
|
|
JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm2MB
|
|
|
|
two_bytes_match_emit_repeat_encodeBetterBlockAsm2MB:
|
|
MOVB $0xe8, (CX)
|
|
MOVB SI, 1(CX)
|
|
ADDL $0x1d, SI
|
|
ADDQ $0x02, CX
|
|
CMPL SI, $0x40
|
|
JB memmove_midmatch_emit_repeat_encodeBetterBlockAsm2MB
|
|
JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm2MB
|
|
|
|
one_byte_match_emit_repeat_encodeBetterBlockAsm2MB:
|
|
SHLB $0x03, SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x01, CX
|
|
LEAQ (CX)(DI*1), SI
|
|
|
|
// genMemMoveShort
|
|
// margin: 16, min move: 1
|
|
CMPQ DI, $0x10
|
|
JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm2MB_memmove_move_8through16
|
|
CMPQ DI, $0x20
|
|
JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm2MB_memmove_move_17through32
|
|
JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm2MB_memmove_move_33through64
|
|
|
|
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm2MB_memmove_move_8through16:
|
|
MOVOU (R8), X0
|
|
MOVOU X0, (CX)
|
|
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm2MB
|
|
|
|
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm2MB_memmove_move_17through32:
|
|
MOVOU (R8), X0
|
|
MOVOU -16(R8)(DI*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(DI*1)
|
|
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm2MB
|
|
|
|
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm2MB_memmove_move_33through64:
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(DI*1), X2
|
|
MOVOU -16(R8)(DI*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(DI*1)
|
|
MOVOU X3, -16(CX)(DI*1)
|
|
|
|
memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm2MB:
|
|
MOVQ SI, CX
|
|
JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm2MB
|
|
|
|
memmove_midmatch_emit_repeat_encodeBetterBlockAsm2MB:
|
|
LEAQ (CX)(DI*1), SI
|
|
|
|
// genMemMoveShort
|
|
// margin: 15, min move: 30
|
|
CMPQ DI, $0x20
|
|
JBE emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm2MB_memmove_move_17through32
|
|
JMP emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm2MB_memmove_move_33through64
|
|
|
|
emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm2MB_memmove_move_17through32:
|
|
MOVOU (R8), X0
|
|
MOVOU -16(R8)(DI*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(DI*1)
|
|
JMP memmove_mid_end_copy_match_emit_repeat_encodeBetterBlockAsm2MB
|
|
|
|
emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm2MB_memmove_move_33through64:
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(DI*1), X2
|
|
MOVOU -16(R8)(DI*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(DI*1)
|
|
MOVOU X3, -16(CX)(DI*1)
|
|
|
|
memmove_mid_end_copy_match_emit_repeat_encodeBetterBlockAsm2MB:
|
|
MOVQ SI, CX
|
|
JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm2MB
|
|
|
|
memmove_long_match_emit_repeat_encodeBetterBlockAsm2MB:
|
|
LEAQ (CX)(DI*1), SI
|
|
|
|
// genMemMoveLong
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(DI*1), X2
|
|
MOVOU -16(R8)(DI*1), X3
|
|
MOVQ DI, R10
|
|
SHRQ $0x05, R10
|
|
MOVQ CX, R9
|
|
ANDL $0x0000001f, R9
|
|
MOVQ $0x00000040, R12
|
|
SUBQ R9, R12
|
|
DECQ R10
|
|
JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm2MBlarge_forward_sse_loop_32
|
|
LEAQ -32(R8)(R12*1), R9
|
|
LEAQ -32(CX)(R12*1), R13
|
|
|
|
emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm2MBlarge_big_loop_back:
|
|
MOVOU (R9), X4
|
|
MOVOU 16(R9), X5
|
|
MOVOA X4, (R13)
|
|
MOVOA X5, 16(R13)
|
|
ADDQ $0x20, R13
|
|
ADDQ $0x20, R9
|
|
ADDQ $0x20, R12
|
|
DECQ R10
|
|
JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm2MBlarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm2MBlarge_forward_sse_loop_32:
|
|
MOVOU -32(R8)(R12*1), X4
|
|
MOVOU -16(R8)(R12*1), X5
|
|
MOVOA X4, -32(CX)(R12*1)
|
|
MOVOA X5, -16(CX)(R12*1)
|
|
ADDQ $0x20, R12
|
|
CMPQ DI, R12
|
|
JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm2MBlarge_forward_sse_loop_32
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(DI*1)
|
|
MOVOU X3, -16(CX)(DI*1)
|
|
MOVQ SI, CX
|
|
|
|
emit_literal_done_match_emit_repeat_encodeBetterBlockAsm2MB:
|
|
ADDL R11, AX
|
|
ADDL $0x04, R11
|
|
MOVL AX, 12(SP)
|
|
|
|
// emitRepeat
|
|
LEAL -1(R11), SI
|
|
CMPL R11, $0x1d
|
|
JBE repeat_one_match_nolit_repeat_encodeBetterBlockAsm2MB
|
|
LEAL -30(R11), SI
|
|
CMPL R11, $0x0000011e
|
|
JB repeat_two_match_nolit_repeat_encodeBetterBlockAsm2MB
|
|
CMPL R11, $0x0001001e
|
|
JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm2MB
|
|
MOVB $0xfc, (CX)
|
|
MOVL SI, 1(CX)
|
|
ADDQ $0x04, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB
|
|
|
|
repeat_three_match_nolit_repeat_encodeBetterBlockAsm2MB:
|
|
MOVB $0xf4, (CX)
|
|
MOVW SI, 1(CX)
|
|
ADDQ $0x03, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB
|
|
|
|
repeat_two_match_nolit_repeat_encodeBetterBlockAsm2MB:
|
|
MOVB $0xec, (CX)
|
|
MOVB SI, 1(CX)
|
|
ADDQ $0x02, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB
|
|
|
|
repeat_one_match_nolit_repeat_encodeBetterBlockAsm2MB:
|
|
XORL SI, SI
|
|
LEAL -4(SI)(R11*8), SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x01, CX
|
|
|
|
match_nolit_emitcopy_end_encodeBetterBlockAsm2MB:
|
|
CMPL AX, 8(SP)
|
|
JAE emit_remainder_encodeBetterBlockAsm2MB
|
|
CMPQ CX, (SP)
|
|
JB match_nolit_dst_ok_encodeBetterBlockAsm2MB
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
match_nolit_dst_ok_encodeBetterBlockAsm2MB:
|
|
MOVQ tmp+48(FP), SI
|
|
MOVQ $0x00cf1bbcdcbfa563, DI
|
|
MOVQ $0x9e3779b1, R8
|
|
LEAQ 1(BX), BX
|
|
LEAQ -2(AX), R9
|
|
MOVQ (DX)(BX*1), R10
|
|
MOVQ 1(DX)(BX*1), R11
|
|
MOVQ (DX)(R9*1), R12
|
|
MOVQ 1(DX)(R9*1), R13
|
|
SHLQ $0x08, R10
|
|
IMULQ DI, R10
|
|
SHRQ $0x2f, R10
|
|
SHLQ $0x20, R11
|
|
IMULQ R8, R11
|
|
SHRQ $0x32, R11
|
|
SHLQ $0x08, R12
|
|
IMULQ DI, R12
|
|
SHRQ $0x2f, R12
|
|
SHLQ $0x20, R13
|
|
IMULQ R8, R13
|
|
SHRQ $0x32, R13
|
|
LEAQ 1(BX), R8
|
|
LEAQ 1(R9), R14
|
|
MOVL BX, (SI)(R10*4)
|
|
MOVL R9, (SI)(R12*4)
|
|
LEAQ 1(R9)(BX*1), R10
|
|
SHRQ $0x01, R10
|
|
ADDQ $0x01, BX
|
|
SUBQ $0x01, R9
|
|
MOVL R8, 524288(SI)(R11*4)
|
|
MOVL R14, 524288(SI)(R13*4)
|
|
|
|
index_loop_encodeBetterBlockAsm2MB:
|
|
CMPQ R10, R9
|
|
JAE search_loop_encodeBetterBlockAsm2MB
|
|
MOVQ (DX)(BX*1), R8
|
|
MOVQ (DX)(R10*1), R11
|
|
SHLQ $0x08, R8
|
|
IMULQ DI, R8
|
|
SHRQ $0x2f, R8
|
|
SHLQ $0x08, R11
|
|
IMULQ DI, R11
|
|
SHRQ $0x2f, R11
|
|
MOVL BX, (SI)(R8*4)
|
|
MOVL R9, (SI)(R11*4)
|
|
ADDQ $0x02, BX
|
|
ADDQ $0x02, R10
|
|
JMP index_loop_encodeBetterBlockAsm2MB
|
|
|
|
emit_remainder_encodeBetterBlockAsm2MB:
|
|
MOVQ src_len+32(FP), AX
|
|
SUBL 12(SP), AX
|
|
LEAQ 4(CX)(AX*1), AX
|
|
CMPQ AX, (SP)
|
|
JB emit_remainder_ok_encodeBetterBlockAsm2MB
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
emit_remainder_ok_encodeBetterBlockAsm2MB:
|
|
MOVQ src_len+32(FP), AX
|
|
|
|
// emitLiteralsDstP
|
|
MOVL 12(SP), BX
|
|
CMPL BX, AX
|
|
JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm2MB
|
|
MOVL AX, SI
|
|
MOVL AX, 12(SP)
|
|
LEAQ (DX)(BX*1), AX
|
|
SUBL BX, SI
|
|
|
|
// emitLiteral
|
|
LEAL -1(SI), DX
|
|
CMPL DX, $0x1d
|
|
JB one_byte_emit_remainder_encodeBetterBlockAsm2MB
|
|
SUBL $0x1d, DX
|
|
CMPL DX, $0x00000100
|
|
JB two_bytes_emit_remainder_encodeBetterBlockAsm2MB
|
|
CMPL DX, $0x00010000
|
|
JB three_bytes_emit_remainder_encodeBetterBlockAsm2MB
|
|
MOVL DX, BX
|
|
SHRL $0x10, BX
|
|
MOVB $0xf8, (CX)
|
|
MOVW DX, 1(CX)
|
|
MOVB BL, 3(CX)
|
|
ADDQ $0x04, CX
|
|
ADDL $0x1d, DX
|
|
JMP memmove_long_emit_remainder_encodeBetterBlockAsm2MB
|
|
|
|
three_bytes_emit_remainder_encodeBetterBlockAsm2MB:
|
|
MOVB $0xf0, (CX)
|
|
MOVW DX, 1(CX)
|
|
ADDQ $0x03, CX
|
|
ADDL $0x1d, DX
|
|
JMP memmove_long_emit_remainder_encodeBetterBlockAsm2MB
|
|
|
|
two_bytes_emit_remainder_encodeBetterBlockAsm2MB:
|
|
MOVB $0xe8, (CX)
|
|
MOVB DL, 1(CX)
|
|
ADDL $0x1d, DX
|
|
ADDQ $0x02, CX
|
|
CMPL DX, $0x40
|
|
JB memmove_midemit_remainder_encodeBetterBlockAsm2MB
|
|
JMP memmove_long_emit_remainder_encodeBetterBlockAsm2MB
|
|
|
|
one_byte_emit_remainder_encodeBetterBlockAsm2MB:
|
|
SHLB $0x03, DL
|
|
MOVB DL, (CX)
|
|
ADDQ $0x01, CX
|
|
LEAQ (CX)(SI*1), DX
|
|
MOVL SI, BX
|
|
|
|
// genMemMoveShort
|
|
// margin: -1, min move: 1
|
|
CMPQ BX, $0x03
|
|
JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm2MB_memmove_move_1or2
|
|
JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm2MB_memmove_move_3
|
|
CMPQ BX, $0x08
|
|
JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm2MB_memmove_move_4through8
|
|
CMPQ BX, $0x10
|
|
JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm2MB_memmove_move_8through16
|
|
CMPQ BX, $0x20
|
|
JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm2MB_memmove_move_17through32
|
|
JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm2MB_memmove_move_33through64
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm2MB_memmove_move_1or2:
|
|
MOVB (AX), SI
|
|
MOVB -1(AX)(BX*1), AL
|
|
MOVB SI, (CX)
|
|
MOVB AL, -1(CX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm2MB
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm2MB_memmove_move_3:
|
|
MOVW (AX), SI
|
|
MOVB 2(AX), AL
|
|
MOVW SI, (CX)
|
|
MOVB AL, 2(CX)
|
|
JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm2MB
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm2MB_memmove_move_4through8:
|
|
MOVL (AX), SI
|
|
MOVL -4(AX)(BX*1), AX
|
|
MOVL SI, (CX)
|
|
MOVL AX, -4(CX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm2MB
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm2MB_memmove_move_8through16:
|
|
MOVQ (AX), SI
|
|
MOVQ -8(AX)(BX*1), AX
|
|
MOVQ SI, (CX)
|
|
MOVQ AX, -8(CX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm2MB
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm2MB_memmove_move_17through32:
|
|
MOVOU (AX), X0
|
|
MOVOU -16(AX)(BX*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm2MB
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm2MB_memmove_move_33through64:
|
|
MOVOU (AX), X0
|
|
MOVOU 16(AX), X1
|
|
MOVOU -32(AX)(BX*1), X2
|
|
MOVOU -16(AX)(BX*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(BX*1)
|
|
MOVOU X3, -16(CX)(BX*1)
|
|
|
|
memmove_end_copy_emit_remainder_encodeBetterBlockAsm2MB:
|
|
MOVQ DX, CX
|
|
JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm2MB
|
|
|
|
memmove_midemit_remainder_encodeBetterBlockAsm2MB:
|
|
LEAQ (CX)(SI*1), DX
|
|
MOVL SI, BX
|
|
|
|
// genMemMoveShort
|
|
// margin: -2, min move: 30
|
|
CMPQ BX, $0x20
|
|
JBE emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm2MB_memmove_move_17through32
|
|
JMP emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm2MB_memmove_move_33through64
|
|
|
|
emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm2MB_memmove_move_17through32:
|
|
MOVOU (AX), X0
|
|
MOVOU -16(AX)(BX*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(BX*1)
|
|
JMP memmove_mid_end_copy_emit_remainder_encodeBetterBlockAsm2MB
|
|
|
|
emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm2MB_memmove_move_33through64:
|
|
MOVOU (AX), X0
|
|
MOVOU 16(AX), X1
|
|
MOVOU -32(AX)(BX*1), X2
|
|
MOVOU -16(AX)(BX*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(BX*1)
|
|
MOVOU X3, -16(CX)(BX*1)
|
|
|
|
memmove_mid_end_copy_emit_remainder_encodeBetterBlockAsm2MB:
|
|
MOVQ DX, CX
|
|
JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm2MB
|
|
|
|
memmove_long_emit_remainder_encodeBetterBlockAsm2MB:
|
|
LEAQ (CX)(SI*1), DX
|
|
MOVL SI, BX
|
|
|
|
// genMemMoveLong
|
|
MOVOU (AX), X0
|
|
MOVOU 16(AX), X1
|
|
MOVOU -32(AX)(BX*1), X2
|
|
MOVOU -16(AX)(BX*1), X3
|
|
MOVQ BX, DI
|
|
SHRQ $0x05, DI
|
|
MOVQ CX, SI
|
|
ANDL $0x0000001f, SI
|
|
MOVQ $0x00000040, R8
|
|
SUBQ SI, R8
|
|
DECQ DI
|
|
JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm2MBlarge_forward_sse_loop_32
|
|
LEAQ -32(AX)(R8*1), SI
|
|
LEAQ -32(CX)(R8*1), R9
|
|
|
|
emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm2MBlarge_big_loop_back:
|
|
MOVOU (SI), X4
|
|
MOVOU 16(SI), X5
|
|
MOVOA X4, (R9)
|
|
MOVOA X5, 16(R9)
|
|
ADDQ $0x20, R9
|
|
ADDQ $0x20, SI
|
|
ADDQ $0x20, R8
|
|
DECQ DI
|
|
JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm2MBlarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm2MBlarge_forward_sse_loop_32:
|
|
MOVOU -32(AX)(R8*1), X4
|
|
MOVOU -16(AX)(R8*1), X5
|
|
MOVOA X4, -32(CX)(R8*1)
|
|
MOVOA X5, -16(CX)(R8*1)
|
|
ADDQ $0x20, R8
|
|
CMPQ BX, R8
|
|
JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm2MBlarge_forward_sse_loop_32
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(BX*1)
|
|
MOVOU X3, -16(CX)(BX*1)
|
|
MOVQ DX, CX
|
|
|
|
emit_literal_done_emit_remainder_encodeBetterBlockAsm2MB:
|
|
MOVQ dst_base+0(FP), AX
|
|
SUBQ AX, CX
|
|
MOVQ CX, ret+56(FP)
|
|
RET
|
|
|
|
// func encodeBetterBlockAsm512K(dst []byte, src []byte, tmp *[294912]byte) int
|
|
// Requires: BMI, CMOV, SSE2
|
|
TEXT ·encodeBetterBlockAsm512K(SB), $24-64
|
|
MOVQ tmp+48(FP), AX
|
|
MOVQ dst_base+0(FP), CX
|
|
MOVQ $0x00000900, DX
|
|
PXOR X0, X0
|
|
|
|
zero_loop_encodeBetterBlockAsm512K:
|
|
MOVOU X0, (AX)
|
|
MOVOU X0, 16(AX)
|
|
MOVOU X0, 32(AX)
|
|
MOVOU X0, 48(AX)
|
|
MOVOU X0, 64(AX)
|
|
MOVOU X0, 80(AX)
|
|
MOVOU X0, 96(AX)
|
|
MOVOU X0, 112(AX)
|
|
ADDQ $0x80, AX
|
|
DECQ DX
|
|
JNZ zero_loop_encodeBetterBlockAsm512K
|
|
MOVL $0x00000000, 12(SP)
|
|
MOVQ src_len+32(FP), AX
|
|
LEAQ -11(AX), DX
|
|
LEAQ -8(AX), BX
|
|
MOVL BX, 8(SP)
|
|
SHRQ $0x05, AX
|
|
SUBL AX, DX
|
|
LEAQ (CX)(DX*1), DX
|
|
MOVQ DX, (SP)
|
|
MOVL $0x00000001, AX
|
|
MOVL AX, 16(SP)
|
|
MOVQ src_base+24(FP), DX
|
|
|
|
search_loop_encodeBetterBlockAsm512K:
|
|
MOVQ tmp+48(FP), BX
|
|
MOVL AX, SI
|
|
SUBL 12(SP), SI
|
|
SHRL $0x07, SI
|
|
CMPL SI, $0x63
|
|
JBE check_maxskip_ok_encodeBetterBlockAsm512K
|
|
LEAL 100(AX), SI
|
|
JMP check_maxskip_cont_encodeBetterBlockAsm512K
|
|
|
|
check_maxskip_ok_encodeBetterBlockAsm512K:
|
|
LEAL 1(AX)(SI*1), SI
|
|
|
|
check_maxskip_cont_encodeBetterBlockAsm512K:
|
|
CMPL SI, 8(SP)
|
|
JAE emit_remainder_encodeBetterBlockAsm512K
|
|
MOVQ (DX)(AX*1), DI
|
|
MOVL SI, 20(SP)
|
|
MOVQ $0x00cf1bbcdcbfa563, R9
|
|
MOVQ $0x9e3779b1, SI
|
|
MOVQ DI, R10
|
|
MOVQ DI, R11
|
|
SHLQ $0x08, R10
|
|
IMULQ R9, R10
|
|
SHRQ $0x30, R10
|
|
SHLQ $0x20, R11
|
|
IMULQ SI, R11
|
|
SHRQ $0x33, R11
|
|
MOVL (BX)(R10*4), SI
|
|
MOVL 262144(BX)(R11*4), R8
|
|
MOVL AX, (BX)(R10*4)
|
|
MOVL AX, 262144(BX)(R11*4)
|
|
MOVQ (DX)(SI*1), R10
|
|
CMPQ R10, DI
|
|
JEQ candidate_match_encodeBetterBlockAsm512K
|
|
MOVQ (DX)(R8*1), R11
|
|
CMPQ R11, DI
|
|
MOVL AX, R12
|
|
SUBL 16(SP), R12
|
|
MOVQ (DX)(R12*1), R12
|
|
MOVQ $0x000000ffffffff00, R13
|
|
XORQ DI, R12
|
|
TESTQ R13, R12
|
|
JNE no_repeat_found_encodeBetterBlockAsm512K
|
|
LEAL 1(AX), BX
|
|
MOVL 12(SP), SI
|
|
MOVL BX, DI
|
|
SUBL 16(SP), DI
|
|
JZ repeat_extend_back_end_encodeBetterBlockAsm512K
|
|
|
|
repeat_extend_back_loop_encodeBetterBlockAsm512K:
|
|
CMPL BX, SI
|
|
JBE repeat_extend_back_end_encodeBetterBlockAsm512K
|
|
MOVB -1(DX)(DI*1), R8
|
|
MOVB -1(DX)(BX*1), R9
|
|
CMPB R8, R9
|
|
JNE repeat_extend_back_end_encodeBetterBlockAsm512K
|
|
LEAL -1(BX), BX
|
|
DECL DI
|
|
JNZ repeat_extend_back_loop_encodeBetterBlockAsm512K
|
|
|
|
repeat_extend_back_end_encodeBetterBlockAsm512K:
|
|
MOVL BX, SI
|
|
SUBL 12(SP), SI
|
|
LEAQ 4(CX)(SI*1), SI
|
|
CMPQ SI, (SP)
|
|
JB repeat_dst_size_check_encodeBetterBlockAsm512K
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
repeat_dst_size_check_encodeBetterBlockAsm512K:
|
|
// emitLiteralsDstP
|
|
MOVL 12(SP), SI
|
|
CMPL SI, BX
|
|
JEQ emit_literal_done_repeat_emit_encodeBetterBlockAsm512K
|
|
MOVL BX, DI
|
|
MOVL BX, 12(SP)
|
|
LEAQ (DX)(SI*1), R8
|
|
SUBL SI, DI
|
|
|
|
// emitLiteral
|
|
LEAL -1(DI), SI
|
|
CMPL SI, $0x1d
|
|
JB one_byte_repeat_emit_encodeBetterBlockAsm512K
|
|
SUBL $0x1d, SI
|
|
CMPL SI, $0x00000100
|
|
JB two_bytes_repeat_emit_encodeBetterBlockAsm512K
|
|
CMPL SI, $0x00010000
|
|
JB three_bytes_repeat_emit_encodeBetterBlockAsm512K
|
|
MOVL SI, R9
|
|
SHRL $0x10, R9
|
|
MOVB $0xf8, (CX)
|
|
MOVW SI, 1(CX)
|
|
MOVB R9, 3(CX)
|
|
ADDQ $0x04, CX
|
|
ADDL $0x1d, SI
|
|
JMP memmove_long_repeat_emit_encodeBetterBlockAsm512K
|
|
|
|
three_bytes_repeat_emit_encodeBetterBlockAsm512K:
|
|
MOVB $0xf0, (CX)
|
|
MOVW SI, 1(CX)
|
|
ADDQ $0x03, CX
|
|
ADDL $0x1d, SI
|
|
JMP memmove_long_repeat_emit_encodeBetterBlockAsm512K
|
|
|
|
two_bytes_repeat_emit_encodeBetterBlockAsm512K:
|
|
MOVB $0xe8, (CX)
|
|
MOVB SI, 1(CX)
|
|
ADDL $0x1d, SI
|
|
ADDQ $0x02, CX
|
|
CMPL SI, $0x40
|
|
JB memmove_midrepeat_emit_encodeBetterBlockAsm512K
|
|
JMP memmove_long_repeat_emit_encodeBetterBlockAsm512K
|
|
|
|
one_byte_repeat_emit_encodeBetterBlockAsm512K:
|
|
SHLB $0x03, SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x01, CX
|
|
LEAQ (CX)(DI*1), SI
|
|
|
|
// genMemMoveShort
|
|
// margin: 8, min move: 1
|
|
CMPQ DI, $0x08
|
|
JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm512K_memmove_move_8
|
|
CMPQ DI, $0x10
|
|
JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm512K_memmove_move_8through16
|
|
CMPQ DI, $0x20
|
|
JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm512K_memmove_move_17through32
|
|
JMP emit_lit_memmove_repeat_emit_encodeBetterBlockAsm512K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_repeat_emit_encodeBetterBlockAsm512K_memmove_move_8:
|
|
MOVQ (R8), R9
|
|
MOVQ R9, (CX)
|
|
JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm512K
|
|
|
|
emit_lit_memmove_repeat_emit_encodeBetterBlockAsm512K_memmove_move_8through16:
|
|
MOVQ (R8), R9
|
|
MOVQ -8(R8)(DI*1), R8
|
|
MOVQ R9, (CX)
|
|
MOVQ R8, -8(CX)(DI*1)
|
|
JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm512K
|
|
|
|
emit_lit_memmove_repeat_emit_encodeBetterBlockAsm512K_memmove_move_17through32:
|
|
MOVOU (R8), X0
|
|
MOVOU -16(R8)(DI*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(DI*1)
|
|
JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm512K
|
|
|
|
emit_lit_memmove_repeat_emit_encodeBetterBlockAsm512K_memmove_move_33through64:
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(DI*1), X2
|
|
MOVOU -16(R8)(DI*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(DI*1)
|
|
MOVOU X3, -16(CX)(DI*1)
|
|
|
|
memmove_end_copy_repeat_emit_encodeBetterBlockAsm512K:
|
|
MOVQ SI, CX
|
|
JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm512K
|
|
|
|
memmove_midrepeat_emit_encodeBetterBlockAsm512K:
|
|
LEAQ (CX)(DI*1), SI
|
|
|
|
// genMemMoveShort
|
|
// margin: 8, min move: 30
|
|
CMPQ DI, $0x20
|
|
JBE emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm512K_memmove_move_17through32
|
|
JMP emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm512K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm512K_memmove_move_17through32:
|
|
MOVOU (R8), X0
|
|
MOVOU -16(R8)(DI*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(DI*1)
|
|
JMP memmove_mid_end_copy_repeat_emit_encodeBetterBlockAsm512K
|
|
|
|
emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm512K_memmove_move_33through64:
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(DI*1), X2
|
|
MOVOU -16(R8)(DI*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(DI*1)
|
|
MOVOU X3, -16(CX)(DI*1)
|
|
|
|
memmove_mid_end_copy_repeat_emit_encodeBetterBlockAsm512K:
|
|
MOVQ SI, CX
|
|
JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm512K
|
|
|
|
memmove_long_repeat_emit_encodeBetterBlockAsm512K:
|
|
LEAQ (CX)(DI*1), SI
|
|
|
|
// genMemMoveLong
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(DI*1), X2
|
|
MOVOU -16(R8)(DI*1), X3
|
|
MOVQ DI, R10
|
|
SHRQ $0x05, R10
|
|
MOVQ CX, R9
|
|
ANDL $0x0000001f, R9
|
|
MOVQ $0x00000040, R11
|
|
SUBQ R9, R11
|
|
DECQ R10
|
|
JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm512Klarge_forward_sse_loop_32
|
|
LEAQ -32(R8)(R11*1), R9
|
|
LEAQ -32(CX)(R11*1), R12
|
|
|
|
emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm512Klarge_big_loop_back:
|
|
MOVOU (R9), X4
|
|
MOVOU 16(R9), X5
|
|
MOVOA X4, (R12)
|
|
MOVOA X5, 16(R12)
|
|
ADDQ $0x20, R12
|
|
ADDQ $0x20, R9
|
|
ADDQ $0x20, R11
|
|
DECQ R10
|
|
JNA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm512Klarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm512Klarge_forward_sse_loop_32:
|
|
MOVOU -32(R8)(R11*1), X4
|
|
MOVOU -16(R8)(R11*1), X5
|
|
MOVOA X4, -32(CX)(R11*1)
|
|
MOVOA X5, -16(CX)(R11*1)
|
|
ADDQ $0x20, R11
|
|
CMPQ DI, R11
|
|
JAE emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm512Klarge_forward_sse_loop_32
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(DI*1)
|
|
MOVOU X3, -16(CX)(DI*1)
|
|
MOVQ SI, CX
|
|
|
|
emit_literal_done_repeat_emit_encodeBetterBlockAsm512K:
|
|
ADDL $0x05, AX
|
|
MOVL AX, SI
|
|
SUBL 16(SP), SI
|
|
MOVQ src_len+32(FP), DI
|
|
SUBL AX, DI
|
|
LEAQ (DX)(AX*1), R8
|
|
LEAQ (DX)(SI*1), SI
|
|
|
|
// matchLen
|
|
XORL R10, R10
|
|
JMP matchlen_loop_16_entry_repeat_extend_encodeBetterBlockAsm512K
|
|
|
|
matchlen_loopback_16_repeat_extend_encodeBetterBlockAsm512K:
|
|
MOVQ (R8)(R10*1), R9
|
|
MOVQ 8(R8)(R10*1), R11
|
|
XORQ (SI)(R10*1), R9
|
|
JNZ matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm512K
|
|
XORQ 8(SI)(R10*1), R11
|
|
JNZ matchlen_bsf_16repeat_extend_encodeBetterBlockAsm512K
|
|
LEAL -16(DI), DI
|
|
LEAL 16(R10), R10
|
|
|
|
matchlen_loop_16_entry_repeat_extend_encodeBetterBlockAsm512K:
|
|
CMPL DI, $0x10
|
|
JAE matchlen_loopback_16_repeat_extend_encodeBetterBlockAsm512K
|
|
JMP matchlen_match8_repeat_extend_encodeBetterBlockAsm512K
|
|
|
|
matchlen_bsf_16repeat_extend_encodeBetterBlockAsm512K:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R11, R11
|
|
|
|
#else
|
|
BSFQ R11, R11
|
|
|
|
#endif
|
|
SARQ $0x03, R11
|
|
LEAL 8(R10)(R11*1), R10
|
|
JMP repeat_extend_forward_end_encodeBetterBlockAsm512K
|
|
|
|
matchlen_match8_repeat_extend_encodeBetterBlockAsm512K:
|
|
CMPL DI, $0x08
|
|
JB matchlen_match4_repeat_extend_encodeBetterBlockAsm512K
|
|
MOVQ (R8)(R10*1), R9
|
|
XORQ (SI)(R10*1), R9
|
|
JNZ matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm512K
|
|
LEAL -8(DI), DI
|
|
LEAL 8(R10), R10
|
|
JMP matchlen_match4_repeat_extend_encodeBetterBlockAsm512K
|
|
|
|
matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm512K:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R9, R9
|
|
|
|
#else
|
|
BSFQ R9, R9
|
|
|
|
#endif
|
|
SARQ $0x03, R9
|
|
LEAL (R10)(R9*1), R10
|
|
JMP repeat_extend_forward_end_encodeBetterBlockAsm512K
|
|
|
|
matchlen_match4_repeat_extend_encodeBetterBlockAsm512K:
|
|
CMPL DI, $0x04
|
|
JB matchlen_match2_repeat_extend_encodeBetterBlockAsm512K
|
|
MOVL (R8)(R10*1), R9
|
|
CMPL (SI)(R10*1), R9
|
|
JNE matchlen_match2_repeat_extend_encodeBetterBlockAsm512K
|
|
LEAL -4(DI), DI
|
|
LEAL 4(R10), R10
|
|
|
|
matchlen_match2_repeat_extend_encodeBetterBlockAsm512K:
|
|
CMPL DI, $0x01
|
|
JE matchlen_match1_repeat_extend_encodeBetterBlockAsm512K
|
|
JB repeat_extend_forward_end_encodeBetterBlockAsm512K
|
|
MOVW (R8)(R10*1), R9
|
|
CMPW (SI)(R10*1), R9
|
|
JNE matchlen_match1_repeat_extend_encodeBetterBlockAsm512K
|
|
LEAL 2(R10), R10
|
|
SUBL $0x02, DI
|
|
JZ repeat_extend_forward_end_encodeBetterBlockAsm512K
|
|
|
|
matchlen_match1_repeat_extend_encodeBetterBlockAsm512K:
|
|
MOVB (R8)(R10*1), R9
|
|
CMPB (SI)(R10*1), R9
|
|
JNE repeat_extend_forward_end_encodeBetterBlockAsm512K
|
|
LEAL 1(R10), R10
|
|
|
|
repeat_extend_forward_end_encodeBetterBlockAsm512K:
|
|
ADDL R10, AX
|
|
MOVL AX, SI
|
|
SUBL BX, SI
|
|
MOVL 16(SP), BX
|
|
|
|
// emitRepeat
// Writes a repeat operation for the length held in SI at CX and advances CX
// by the number of bytes kept (1 to 4):
//   SI <= 29      : 1 byte,  value SI*8 - 4
//   SI <  0x11e   : 2 bytes, 0xec followed by the low byte of SI-30
//   SI <  0x1001e : 3 bytes, 0xf4 followed by the low 16 bits of SI-30
//   otherwise     : 4 bytes, 0xfc followed by SI-30
|
|
// NOTE(review): this result is never read - BX is overwritten on every path
// below (XORL in the one-byte case, LEAL -30 otherwise).
LEAL -1(SI), BX
|
|
CMPL SI, $0x1d
|
|
JBE repeat_one_match_repeat_encodeBetterBlockAsm512K
|
|
// BX = SI - 30, the value stored by the multi-byte forms.
LEAL -30(SI), BX
|
|
CMPL SI, $0x0000011e
|
|
JB repeat_two_match_repeat_encodeBetterBlockAsm512K
|
|
CMPL SI, $0x0001001e
|
|
JB repeat_three_match_repeat_encodeBetterBlockAsm512K
|
|
// Four-byte form.
MOVB $0xfc, (CX)
|
|
// The 32-bit store covers 1(CX)..4(CX) while CX advances by only 4, so the
// last stored byte sits at the new CX.
MOVL BX, 1(CX)
|
|
ADDQ $0x04, CX
|
|
JMP repeat_end_emit_encodeBetterBlockAsm512K
|
|
|
|
// Three-byte form: tag 0xf4 plus a 16-bit store of BX.
repeat_three_match_repeat_encodeBetterBlockAsm512K:
|
|
MOVB $0xf4, (CX)
|
|
MOVW BX, 1(CX)
|
|
ADDQ $0x03, CX
|
|
JMP repeat_end_emit_encodeBetterBlockAsm512K
|
|
|
|
// Two-byte form: tag 0xec plus the low byte of BX.
repeat_two_match_repeat_encodeBetterBlockAsm512K:
|
|
MOVB $0xec, (CX)
|
|
MOVB BL, 1(CX)
|
|
ADDQ $0x02, CX
|
|
JMP repeat_end_emit_encodeBetterBlockAsm512K
|
|
|
|
// One-byte form: the single byte is the low byte of SI*8 - 4.
repeat_one_match_repeat_encodeBetterBlockAsm512K:
|
|
XORL BX, BX
|
|
// BX is zero here, so this computes SI*8 - 4.
LEAL -4(BX)(SI*8), BX
|
|
MOVB BL, (CX)
|
|
ADDQ $0x01, CX
|
|
|
|
repeat_end_emit_encodeBetterBlockAsm512K:
|
|
MOVL AX, 12(SP)
|
|
JMP search_loop_encodeBetterBlockAsm512K
|
|
|
|
no_repeat_found_encodeBetterBlockAsm512K:
|
|
CMPL R10, DI
|
|
JEQ candidate_match_encodeBetterBlockAsm512K
|
|
CMPL R11, DI
|
|
JEQ candidateS_match_encodeBetterBlockAsm512K
|
|
MOVL 20(SP), AX
|
|
JMP search_loop_encodeBetterBlockAsm512K
|
|
|
|
// Reached from the R11 == DI comparison above. Re-hashes the value in DI
// shifted down by one byte, swaps the table entry for AX+1, and checks the
// previous entry with a 4-byte compare.
candidateS_match_encodeBetterBlockAsm512K:
|
|
// Drop the lowest byte of DI.
SHRQ $0x08, DI
|
|
MOVQ DI, R10
|
|
// R10 = ((DI << 8) * R9) >> 48, i.e. a 16-bit table index. R9 holds the
// multiplier; it is loaded outside the visible part of this function.
SHLQ $0x08, R10
|
|
IMULQ R9, R10
|
|
SHRQ $0x30, R10
|
|
// SI = previous entry of the 4-byte-wide table based at BX.
MOVL (BX)(R10*4), SI
|
|
// Store AX+1 in that slot.
INCL AX
|
|
MOVL AX, (BX)(R10*4)
|
|
// Compare the 4 bytes at DX+SI with the low 32 bits of DI.
CMPL (DX)(SI*1), DI
|
|
JEQ candidate_match_encodeBetterBlockAsm512K
|
|
// No match at the previous entry: undo the increment and fall through to
// candidate_match with SI = R8 (R8 is set outside the visible part).
DECL AX
|
|
MOVL R8, SI
|
|
|
|
candidate_match_encodeBetterBlockAsm512K:
|
|
MOVL 12(SP), BX
|
|
TESTL SI, SI
|
|
JZ match_extend_back_end_encodeBetterBlockAsm512K
|
|
|
|
match_extend_back_loop_encodeBetterBlockAsm512K:
|
|
CMPL AX, BX
|
|
JBE match_extend_back_end_encodeBetterBlockAsm512K
|
|
MOVB -1(DX)(SI*1), DI
|
|
MOVB -1(DX)(AX*1), R8
|
|
CMPB DI, R8
|
|
JNE match_extend_back_end_encodeBetterBlockAsm512K
|
|
LEAL -1(AX), AX
|
|
DECL SI
|
|
JZ match_extend_back_end_encodeBetterBlockAsm512K
|
|
JMP match_extend_back_loop_encodeBetterBlockAsm512K
|
|
|
|
match_extend_back_end_encodeBetterBlockAsm512K:
|
|
MOVL AX, BX
|
|
SUBL 12(SP), BX
|
|
LEAQ 4(CX)(BX*1), BX
|
|
CMPQ BX, (SP)
|
|
JB match_dst_size_check_encodeBetterBlockAsm512K
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
match_dst_size_check_encodeBetterBlockAsm512K:
|
|
MOVL AX, BX
|
|
ADDL $0x04, AX
|
|
ADDL $0x04, SI
|
|
MOVQ src_len+32(FP), DI
|
|
SUBL AX, DI
|
|
LEAQ (DX)(AX*1), R8
|
|
LEAQ (DX)(SI*1), R9
|
|
|
|
// matchLen
|
|
XORL R11, R11
|
|
JMP matchlen_loop_16_entry_match_nolit_encodeBetterBlockAsm512K
|
|
|
|
matchlen_loopback_16_match_nolit_encodeBetterBlockAsm512K:
|
|
MOVQ (R8)(R11*1), R10
|
|
MOVQ 8(R8)(R11*1), R12
|
|
XORQ (R9)(R11*1), R10
|
|
JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm512K
|
|
XORQ 8(R9)(R11*1), R12
|
|
JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm512K
|
|
LEAL -16(DI), DI
|
|
LEAL 16(R11), R11
|
|
|
|
matchlen_loop_16_entry_match_nolit_encodeBetterBlockAsm512K:
|
|
CMPL DI, $0x10
|
|
JAE matchlen_loopback_16_match_nolit_encodeBetterBlockAsm512K
|
|
JMP matchlen_match8_match_nolit_encodeBetterBlockAsm512K
|
|
|
|
matchlen_bsf_16match_nolit_encodeBetterBlockAsm512K:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R12, R12
|
|
|
|
#else
|
|
BSFQ R12, R12
|
|
|
|
#endif
|
|
SARQ $0x03, R12
|
|
LEAL 8(R11)(R12*1), R11
|
|
JMP match_nolit_end_encodeBetterBlockAsm512K
|
|
|
|
matchlen_match8_match_nolit_encodeBetterBlockAsm512K:
|
|
CMPL DI, $0x08
|
|
JB matchlen_match4_match_nolit_encodeBetterBlockAsm512K
|
|
MOVQ (R8)(R11*1), R10
|
|
XORQ (R9)(R11*1), R10
|
|
JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm512K
|
|
LEAL -8(DI), DI
|
|
LEAL 8(R11), R11
|
|
JMP matchlen_match4_match_nolit_encodeBetterBlockAsm512K
|
|
|
|
matchlen_bsf_8_match_nolit_encodeBetterBlockAsm512K:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R10, R10
|
|
|
|
#else
|
|
BSFQ R10, R10
|
|
|
|
#endif
|
|
SARQ $0x03, R10
|
|
LEAL (R11)(R10*1), R11
|
|
JMP match_nolit_end_encodeBetterBlockAsm512K
|
|
|
|
matchlen_match4_match_nolit_encodeBetterBlockAsm512K:
|
|
CMPL DI, $0x04
|
|
JB matchlen_match2_match_nolit_encodeBetterBlockAsm512K
|
|
MOVL (R8)(R11*1), R10
|
|
CMPL (R9)(R11*1), R10
|
|
JNE matchlen_match2_match_nolit_encodeBetterBlockAsm512K
|
|
LEAL -4(DI), DI
|
|
LEAL 4(R11), R11
|
|
|
|
matchlen_match2_match_nolit_encodeBetterBlockAsm512K:
|
|
CMPL DI, $0x01
|
|
JE matchlen_match1_match_nolit_encodeBetterBlockAsm512K
|
|
JB match_nolit_end_encodeBetterBlockAsm512K
|
|
MOVW (R8)(R11*1), R10
|
|
CMPW (R9)(R11*1), R10
|
|
JNE matchlen_match1_match_nolit_encodeBetterBlockAsm512K
|
|
LEAL 2(R11), R11
|
|
SUBL $0x02, DI
|
|
JZ match_nolit_end_encodeBetterBlockAsm512K
|
|
|
|
matchlen_match1_match_nolit_encodeBetterBlockAsm512K:
|
|
MOVB (R8)(R11*1), R10
|
|
CMPB (R9)(R11*1), R10
|
|
JNE match_nolit_end_encodeBetterBlockAsm512K
|
|
LEAL 1(R11), R11
|
|
|
|
match_nolit_end_encodeBetterBlockAsm512K:
|
|
MOVL AX, DI
|
|
SUBL SI, DI
|
|
CMPL R11, $0x01
|
|
JA match_length_ok_encodeBetterBlockAsm512K
|
|
CMPL DI, $0x0001003f
|
|
JBE match_length_ok_encodeBetterBlockAsm512K
|
|
MOVL 20(SP), AX
|
|
INCL AX
|
|
JMP search_loop_encodeBetterBlockAsm512K
|
|
|
|
match_length_ok_encodeBetterBlockAsm512K:
|
|
MOVL DI, 16(SP)
|
|
|
|
// Check if we can combine lit+copy
|
|
MOVLQZX 12(SP), R8
|
|
MOVL BX, SI
|
|
SUBL R8, SI
|
|
JZ match_emit_nolits_encodeBetterBlockAsm512K
|
|
CMPL DI, $0x00000040
|
|
JL match_emit_lits_encodeBetterBlockAsm512K
|
|
CMPL DI, $0x0001003f
|
|
JA match_emit_copy3_encodeBetterBlockAsm512K
|
|
CMPL SI, $0x04
|
|
JA match_emit_lits_encodeBetterBlockAsm512K
|
|
MOVL (DX)(R8*1), R8
|
|
ADDL R11, AX
|
|
ADDL $0x04, R11
|
|
MOVL AX, 12(SP)
|
|
|
|
// emitCopy2WithLits
|
|
XORQ R9, R9
|
|
SUBL $0x40, DI
|
|
LEAL -11(R11), R10
|
|
LEAL -4(R11), R11
|
|
MOVW DI, 1(CX)
|
|
CMPL R11, $0x07
|
|
CMOVLGE R10, R9
|
|
MOVQ $0x00000007, DI
|
|
CMOVLLT R11, DI
|
|
LEAL -1(SI)(DI*4), DI
|
|
MOVL $0x00000003, R10
|
|
LEAL (R10)(DI*8), DI
|
|
MOVB DI, (CX)
|
|
ADDQ $0x03, CX
|
|
MOVL R8, (CX)
|
|
ADDQ SI, CX
|
|
TESTL R9, R9
|
|
JZ match_nolit_emitcopy_end_encodeBetterBlockAsm512K
|
|
|
|
// emitRepeat
|
|
LEAL -1(R9), SI
|
|
CMPL R9, $0x1d
|
|
JBE repeat_one_match_emit_repeat_copy2_encodeBetterBlockAsm512K
|
|
LEAL -30(R9), SI
|
|
CMPL R9, $0x0000011e
|
|
JB repeat_two_match_emit_repeat_copy2_encodeBetterBlockAsm512K
|
|
CMPL R9, $0x0001001e
|
|
JB repeat_three_match_emit_repeat_copy2_encodeBetterBlockAsm512K
|
|
MOVB $0xfc, (CX)
|
|
MOVL SI, 1(CX)
|
|
ADDQ $0x04, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K
|
|
|
|
repeat_three_match_emit_repeat_copy2_encodeBetterBlockAsm512K:
|
|
MOVB $0xf4, (CX)
|
|
MOVW SI, 1(CX)
|
|
ADDQ $0x03, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K
|
|
|
|
repeat_two_match_emit_repeat_copy2_encodeBetterBlockAsm512K:
|
|
MOVB $0xec, (CX)
|
|
MOVB SI, 1(CX)
|
|
ADDQ $0x02, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K
|
|
|
|
repeat_one_match_emit_repeat_copy2_encodeBetterBlockAsm512K:
|
|
XORL SI, SI
|
|
LEAL -4(SI)(R9*8), SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x01, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K
|
|
|
|
match_emit_copy3_encodeBetterBlockAsm512K:
|
|
CMPL SI, $0x03
|
|
JA match_emit_lits_encodeBetterBlockAsm512K
|
|
MOVLQZX 12(SP), R8
|
|
MOVL (DX)(R8*1), R8
|
|
ADDL R11, AX
|
|
ADDL $0x04, R11
|
|
MOVL AX, 12(SP)
|
|
|
|
// emitCopy3
|
|
LEAL -4(R11), R11
|
|
LEAL -65536(DI), DI
|
|
SHLL $0x0b, DI
|
|
LEAL 7(DI)(SI*8), DI
|
|
CMPL R11, $0x3c
|
|
JBE emit_copy3_0_match_emit_lits_encodeBetterBlockAsm512K
|
|
LEAL -60(R11), R9
|
|
CMPL R11, $0x0000013c
|
|
JB emit_copy3_1_match_emit_lits_encodeBetterBlockAsm512K
|
|
CMPL R11, $0x0001003c
|
|
JB emit_copy3_2_match_emit_lits_encodeBetterBlockAsm512K
|
|
ADDL $0x000007e0, DI
|
|
MOVL DI, (CX)
|
|
MOVL R9, 4(CX)
|
|
ADDQ $0x07, CX
|
|
JMP match_emit_copy_litsencodeBetterBlockAsm512K
|
|
|
|
emit_copy3_2_match_emit_lits_encodeBetterBlockAsm512K:
|
|
ADDL $0x000007c0, DI
|
|
MOVL DI, (CX)
|
|
MOVW R9, 4(CX)
|
|
ADDQ $0x06, CX
|
|
JMP match_emit_copy_litsencodeBetterBlockAsm512K
|
|
|
|
emit_copy3_1_match_emit_lits_encodeBetterBlockAsm512K:
|
|
ADDL $0x000007a0, DI
|
|
MOVL DI, (CX)
|
|
MOVB R9, 4(CX)
|
|
ADDQ $0x05, CX
|
|
JMP match_emit_copy_litsencodeBetterBlockAsm512K
|
|
|
|
emit_copy3_0_match_emit_lits_encodeBetterBlockAsm512K:
|
|
SHLL $0x05, R11
|
|
ORL R11, DI
|
|
MOVL DI, (CX)
|
|
ADDQ $0x04, CX
|
|
|
|
match_emit_copy_litsencodeBetterBlockAsm512K:
|
|
MOVL R8, (CX)
|
|
ADDQ SI, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K
|
|
|
|
match_emit_lits_encodeBetterBlockAsm512K:
|
|
LEAQ (DX)(R8*1), R8
|
|
|
|
// emitLiteral
|
|
LEAL -1(SI), R9
|
|
CMPL R9, $0x1d
|
|
JB one_byte_match_emit_encodeBetterBlockAsm512K
|
|
SUBL $0x1d, R9
|
|
CMPL R9, $0x00000100
|
|
JB two_bytes_match_emit_encodeBetterBlockAsm512K
|
|
CMPL R9, $0x00010000
|
|
JB three_bytes_match_emit_encodeBetterBlockAsm512K
|
|
MOVL R9, R10
|
|
SHRL $0x10, R10
|
|
MOVB $0xf8, (CX)
|
|
MOVW R9, 1(CX)
|
|
MOVB R10, 3(CX)
|
|
ADDQ $0x04, CX
|
|
ADDL $0x1d, R9
|
|
JMP memmove_long_match_emit_encodeBetterBlockAsm512K
|
|
|
|
three_bytes_match_emit_encodeBetterBlockAsm512K:
|
|
MOVB $0xf0, (CX)
|
|
MOVW R9, 1(CX)
|
|
ADDQ $0x03, CX
|
|
ADDL $0x1d, R9
|
|
JMP memmove_long_match_emit_encodeBetterBlockAsm512K
|
|
|
|
two_bytes_match_emit_encodeBetterBlockAsm512K:
|
|
MOVB $0xe8, (CX)
|
|
MOVB R9, 1(CX)
|
|
ADDL $0x1d, R9
|
|
ADDQ $0x02, CX
|
|
CMPL R9, $0x40
|
|
JB memmove_midmatch_emit_encodeBetterBlockAsm512K
|
|
JMP memmove_long_match_emit_encodeBetterBlockAsm512K
|
|
|
|
one_byte_match_emit_encodeBetterBlockAsm512K:
|
|
SHLB $0x03, R9
|
|
MOVB R9, (CX)
|
|
ADDQ $0x01, CX
|
|
LEAQ (CX)(SI*1), R9
|
|
|
|
// genMemMoveShort
|
|
// margin: 8, min move: 1
|
|
CMPQ SI, $0x08
|
|
JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm512K_memmove_move_8
|
|
CMPQ SI, $0x10
|
|
JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm512K_memmove_move_8through16
|
|
CMPQ SI, $0x20
|
|
JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm512K_memmove_move_17through32
|
|
JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm512K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_match_emit_encodeBetterBlockAsm512K_memmove_move_8:
|
|
MOVQ (R8), R10
|
|
MOVQ R10, (CX)
|
|
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm512K
|
|
|
|
emit_lit_memmove_match_emit_encodeBetterBlockAsm512K_memmove_move_8through16:
|
|
MOVQ (R8), R10
|
|
MOVQ -8(R8)(SI*1), R8
|
|
MOVQ R10, (CX)
|
|
MOVQ R8, -8(CX)(SI*1)
|
|
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm512K
|
|
|
|
emit_lit_memmove_match_emit_encodeBetterBlockAsm512K_memmove_move_17through32:
|
|
MOVOU (R8), X0
|
|
MOVOU -16(R8)(SI*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(SI*1)
|
|
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm512K
|
|
|
|
emit_lit_memmove_match_emit_encodeBetterBlockAsm512K_memmove_move_33through64:
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(SI*1), X2
|
|
MOVOU -16(R8)(SI*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(SI*1)
|
|
MOVOU X3, -16(CX)(SI*1)
|
|
|
|
memmove_end_copy_match_emit_encodeBetterBlockAsm512K:
|
|
MOVQ R9, CX
|
|
JMP match_emit_nolits_encodeBetterBlockAsm512K
|
|
|
|
memmove_midmatch_emit_encodeBetterBlockAsm512K:
|
|
LEAQ (CX)(SI*1), R9
|
|
|
|
// genMemMoveShort
|
|
// margin: 8, min move: 30
|
|
CMPQ SI, $0x20
|
|
JBE emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm512K_memmove_move_17through32
|
|
JMP emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm512K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm512K_memmove_move_17through32:
|
|
MOVOU (R8), X0
|
|
MOVOU -16(R8)(SI*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(SI*1)
|
|
JMP memmove_mid_end_copy_match_emit_encodeBetterBlockAsm512K
|
|
|
|
emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm512K_memmove_move_33through64:
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(SI*1), X2
|
|
MOVOU -16(R8)(SI*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(SI*1)
|
|
MOVOU X3, -16(CX)(SI*1)
|
|
|
|
memmove_mid_end_copy_match_emit_encodeBetterBlockAsm512K:
|
|
MOVQ R9, CX
|
|
JMP match_emit_nolits_encodeBetterBlockAsm512K
|
|
|
|
memmove_long_match_emit_encodeBetterBlockAsm512K:
|
|
LEAQ (CX)(SI*1), R9
|
|
|
|
// genMemMoveLong
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(SI*1), X2
|
|
MOVOU -16(R8)(SI*1), X3
|
|
MOVQ SI, R12
|
|
SHRQ $0x05, R12
|
|
MOVQ CX, R10
|
|
ANDL $0x0000001f, R10
|
|
MOVQ $0x00000040, R13
|
|
SUBQ R10, R13
|
|
DECQ R12
|
|
JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm512Klarge_forward_sse_loop_32
|
|
LEAQ -32(R8)(R13*1), R10
|
|
LEAQ -32(CX)(R13*1), R14
|
|
|
|
emit_lit_memmove_long_match_emit_encodeBetterBlockAsm512Klarge_big_loop_back:
|
|
MOVOU (R10), X4
|
|
MOVOU 16(R10), X5
|
|
MOVOA X4, (R14)
|
|
MOVOA X5, 16(R14)
|
|
ADDQ $0x20, R14
|
|
ADDQ $0x20, R10
|
|
ADDQ $0x20, R13
|
|
DECQ R12
|
|
JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm512Klarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_match_emit_encodeBetterBlockAsm512Klarge_forward_sse_loop_32:
|
|
MOVOU -32(R8)(R13*1), X4
|
|
MOVOU -16(R8)(R13*1), X5
|
|
MOVOA X4, -32(CX)(R13*1)
|
|
MOVOA X5, -16(CX)(R13*1)
|
|
ADDQ $0x20, R13
|
|
CMPQ SI, R13
|
|
JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm512Klarge_forward_sse_loop_32
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(SI*1)
|
|
MOVOU X3, -16(CX)(SI*1)
|
|
MOVQ R9, CX
|
|
|
|
match_emit_nolits_encodeBetterBlockAsm512K:
|
|
ADDL R11, AX
|
|
ADDL $0x04, R11
|
|
MOVL AX, 12(SP)
|
|
|
|
// emitCopy
|
|
CMPL DI, $0x0001003f
|
|
JBE two_byte_offset_match_nolit_encodeBetterBlockAsm512K
|
|
|
|
// emitCopy3
|
|
LEAL -4(R11), R11
|
|
LEAL -65536(DI), SI
|
|
SHLL $0x0b, SI
|
|
ADDL $0x07, SI
|
|
CMPL R11, $0x3c
|
|
JBE emit_copy3_0_match_nolit_encodeBetterBlockAsm512K_emit3
|
|
LEAL -60(R11), DI
|
|
CMPL R11, $0x0000013c
|
|
JB emit_copy3_1_match_nolit_encodeBetterBlockAsm512K_emit3
|
|
CMPL R11, $0x0001003c
|
|
JB emit_copy3_2_match_nolit_encodeBetterBlockAsm512K_emit3
|
|
ADDL $0x000007e0, SI
|
|
MOVL SI, (CX)
|
|
MOVL DI, 4(CX)
|
|
ADDQ $0x07, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K
|
|
|
|
emit_copy3_2_match_nolit_encodeBetterBlockAsm512K_emit3:
|
|
ADDL $0x000007c0, SI
|
|
MOVL SI, (CX)
|
|
MOVW DI, 4(CX)
|
|
ADDQ $0x06, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K
|
|
|
|
emit_copy3_1_match_nolit_encodeBetterBlockAsm512K_emit3:
|
|
ADDL $0x000007a0, SI
|
|
MOVL SI, (CX)
|
|
MOVB DI, 4(CX)
|
|
ADDQ $0x05, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K
|
|
|
|
emit_copy3_0_match_nolit_encodeBetterBlockAsm512K_emit3:
|
|
SHLL $0x05, R11
|
|
ORL R11, SI
|
|
MOVL SI, (CX)
|
|
ADDQ $0x04, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K
|
|
|
|
two_byte_offset_match_nolit_encodeBetterBlockAsm512K:
|
|
CMPL DI, $0x00000400
|
|
JA two_byte_match_nolit_encodeBetterBlockAsm512K
|
|
CMPL R11, $0x00000013
|
|
JAE emit_one_longer_match_nolit_encodeBetterBlockAsm512K
|
|
LEAL -1(DI), SI
|
|
SHLL $0x06, SI
|
|
LEAL -15(SI)(R11*4), SI
|
|
MOVW SI, (CX)
|
|
ADDQ $0x02, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K
|
|
|
|
emit_one_longer_match_nolit_encodeBetterBlockAsm512K:
|
|
CMPL R11, $0x00000112
|
|
JAE emit_copy1_repeat_match_nolit_encodeBetterBlockAsm512K
|
|
LEAL -1(DI), SI
|
|
SHLL $0x06, SI
|
|
LEAL 61(SI), SI
|
|
MOVW SI, (CX)
|
|
LEAL -18(R11), SI
|
|
MOVB SI, 2(CX)
|
|
ADDQ $0x03, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K
|
|
|
|
emit_copy1_repeat_match_nolit_encodeBetterBlockAsm512K:
|
|
LEAL -1(DI), SI
|
|
SHLL $0x06, SI
|
|
LEAL 57(SI), SI
|
|
MOVW SI, (CX)
|
|
ADDQ $0x02, CX
|
|
SUBL $0x12, R11
|
|
|
|
// emitRepeat
|
|
LEAL -1(R11), SI
|
|
CMPL R11, $0x1d
|
|
JBE repeat_one_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm512K
|
|
LEAL -30(R11), SI
|
|
CMPL R11, $0x0000011e
|
|
JB repeat_two_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm512K
|
|
CMPL R11, $0x0001001e
|
|
JB repeat_three_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm512K
|
|
MOVB $0xfc, (CX)
|
|
MOVL SI, 1(CX)
|
|
ADDQ $0x04, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K
|
|
|
|
repeat_three_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm512K:
|
|
MOVB $0xf4, (CX)
|
|
MOVW SI, 1(CX)
|
|
ADDQ $0x03, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K
|
|
|
|
repeat_two_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm512K:
|
|
MOVB $0xec, (CX)
|
|
MOVB SI, 1(CX)
|
|
ADDQ $0x02, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K
|
|
|
|
repeat_one_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm512K:
|
|
XORL SI, SI
|
|
LEAL -4(SI)(R11*8), SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x01, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K
|
|
|
|
two_byte_match_nolit_encodeBetterBlockAsm512K:
|
|
// emitCopy2
|
|
LEAL -64(DI), DI
|
|
LEAL -4(R11), R11
|
|
MOVW DI, 1(CX)
|
|
CMPL R11, $0x3c
|
|
JBE emit_copy2_0_match_nolit_encodeBetterBlockAsm512K_emit2
|
|
LEAL -60(R11), SI
|
|
CMPL R11, $0x0000013c
|
|
JB emit_copy2_1_match_nolit_encodeBetterBlockAsm512K_emit2
|
|
CMPL R11, $0x0001003c
|
|
JB emit_copy2_2_match_nolit_encodeBetterBlockAsm512K_emit2
|
|
MOVB $0xfe, (CX)
|
|
MOVL SI, 3(CX)
|
|
ADDQ $0x06, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K
|
|
|
|
emit_copy2_2_match_nolit_encodeBetterBlockAsm512K_emit2:
|
|
MOVB $0xfa, (CX)
|
|
MOVW SI, 3(CX)
|
|
ADDQ $0x05, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K
|
|
|
|
emit_copy2_1_match_nolit_encodeBetterBlockAsm512K_emit2:
|
|
MOVB $0xf6, (CX)
|
|
MOVB SI, 3(CX)
|
|
ADDQ $0x04, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K
|
|
|
|
emit_copy2_0_match_nolit_encodeBetterBlockAsm512K_emit2:
|
|
MOVL $0x00000002, SI
|
|
LEAL (SI)(R11*4), SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x03, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K
|
|
|
|
// emitLiteralsDstP
// NOTE(review): the instruction before this sequence is an unconditional JMP
// and no label is attached here, so this sequence cannot be entered by
// fall-through. Whether the labels further down are targeted from outside
// the visible part of the function could not be checked - confirm before
// relying on this path.
|
|
// SI = 12(SP); when it already equals BX there is nothing to copy.
MOVL 12(SP), SI
|
|
CMPL SI, BX
|
|
JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm512K
|
|
MOVL BX, DI
|
|
// 12(SP) is advanced to BX before the copy is performed.
MOVL BX, 12(SP)
|
|
// R8 = DX + SI, the first byte to copy.
LEAQ (DX)(SI*1), R8
|
|
// DI = BX - SI, the number of bytes to copy.
SUBL SI, DI
|
|
|
|
// emitLiteral
|
|
LEAL -1(DI), SI
|
|
CMPL SI, $0x1d
|
|
JB one_byte_match_emit_repeat_encodeBetterBlockAsm512K
|
|
SUBL $0x1d, SI
|
|
CMPL SI, $0x00000100
|
|
JB two_bytes_match_emit_repeat_encodeBetterBlockAsm512K
|
|
CMPL SI, $0x00010000
|
|
JB three_bytes_match_emit_repeat_encodeBetterBlockAsm512K
|
|
MOVL SI, R9
|
|
SHRL $0x10, R9
|
|
MOVB $0xf8, (CX)
|
|
MOVW SI, 1(CX)
|
|
MOVB R9, 3(CX)
|
|
ADDQ $0x04, CX
|
|
ADDL $0x1d, SI
|
|
JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm512K
|
|
|
|
three_bytes_match_emit_repeat_encodeBetterBlockAsm512K:
|
|
MOVB $0xf0, (CX)
|
|
MOVW SI, 1(CX)
|
|
ADDQ $0x03, CX
|
|
ADDL $0x1d, SI
|
|
JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm512K
|
|
|
|
two_bytes_match_emit_repeat_encodeBetterBlockAsm512K:
|
|
MOVB $0xe8, (CX)
|
|
MOVB SI, 1(CX)
|
|
ADDL $0x1d, SI
|
|
ADDQ $0x02, CX
|
|
CMPL SI, $0x40
|
|
JB memmove_midmatch_emit_repeat_encodeBetterBlockAsm512K
|
|
JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm512K
|
|
|
|
one_byte_match_emit_repeat_encodeBetterBlockAsm512K:
|
|
SHLB $0x03, SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x01, CX
|
|
LEAQ (CX)(DI*1), SI
|
|
|
|
// genMemMoveShort
|
|
// margin: 8, min move: 1
|
|
CMPQ DI, $0x08
|
|
JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm512K_memmove_move_8
|
|
CMPQ DI, $0x10
|
|
JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm512K_memmove_move_8through16
|
|
CMPQ DI, $0x20
|
|
JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm512K_memmove_move_17through32
|
|
JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm512K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm512K_memmove_move_8:
|
|
MOVQ (R8), R9
|
|
MOVQ R9, (CX)
|
|
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm512K
|
|
|
|
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm512K_memmove_move_8through16:
|
|
MOVQ (R8), R9
|
|
MOVQ -8(R8)(DI*1), R8
|
|
MOVQ R9, (CX)
|
|
MOVQ R8, -8(CX)(DI*1)
|
|
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm512K
|
|
|
|
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm512K_memmove_move_17through32:
|
|
MOVOU (R8), X0
|
|
MOVOU -16(R8)(DI*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(DI*1)
|
|
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm512K
|
|
|
|
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm512K_memmove_move_33through64:
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(DI*1), X2
|
|
MOVOU -16(R8)(DI*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(DI*1)
|
|
MOVOU X3, -16(CX)(DI*1)
|
|
|
|
memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm512K:
|
|
MOVQ SI, CX
|
|
JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm512K
|
|
|
|
memmove_midmatch_emit_repeat_encodeBetterBlockAsm512K:
|
|
LEAQ (CX)(DI*1), SI
|
|
|
|
// genMemMoveShort
|
|
// margin: 8, min move: 30
|
|
CMPQ DI, $0x20
|
|
JBE emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm512K_memmove_move_17through32
|
|
JMP emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm512K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm512K_memmove_move_17through32:
|
|
MOVOU (R8), X0
|
|
MOVOU -16(R8)(DI*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(DI*1)
|
|
JMP memmove_mid_end_copy_match_emit_repeat_encodeBetterBlockAsm512K
|
|
|
|
emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm512K_memmove_move_33through64:
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(DI*1), X2
|
|
MOVOU -16(R8)(DI*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(DI*1)
|
|
MOVOU X3, -16(CX)(DI*1)
|
|
|
|
memmove_mid_end_copy_match_emit_repeat_encodeBetterBlockAsm512K:
|
|
MOVQ SI, CX
|
|
JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm512K
|
|
|
|
memmove_long_match_emit_repeat_encodeBetterBlockAsm512K:
|
|
LEAQ (CX)(DI*1), SI
|
|
|
|
// genMemMoveLong
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(DI*1), X2
|
|
MOVOU -16(R8)(DI*1), X3
|
|
MOVQ DI, R10
|
|
SHRQ $0x05, R10
|
|
MOVQ CX, R9
|
|
ANDL $0x0000001f, R9
|
|
MOVQ $0x00000040, R12
|
|
SUBQ R9, R12
|
|
DECQ R10
|
|
JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm512Klarge_forward_sse_loop_32
|
|
LEAQ -32(R8)(R12*1), R9
|
|
LEAQ -32(CX)(R12*1), R13
|
|
|
|
emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm512Klarge_big_loop_back:
|
|
MOVOU (R9), X4
|
|
MOVOU 16(R9), X5
|
|
MOVOA X4, (R13)
|
|
MOVOA X5, 16(R13)
|
|
ADDQ $0x20, R13
|
|
ADDQ $0x20, R9
|
|
ADDQ $0x20, R12
|
|
DECQ R10
|
|
JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm512Klarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm512Klarge_forward_sse_loop_32:
|
|
MOVOU -32(R8)(R12*1), X4
|
|
MOVOU -16(R8)(R12*1), X5
|
|
MOVOA X4, -32(CX)(R12*1)
|
|
MOVOA X5, -16(CX)(R12*1)
|
|
ADDQ $0x20, R12
|
|
CMPQ DI, R12
|
|
JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm512Klarge_forward_sse_loop_32
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(DI*1)
|
|
MOVOU X3, -16(CX)(DI*1)
|
|
MOVQ SI, CX
|
|
|
|
emit_literal_done_match_emit_repeat_encodeBetterBlockAsm512K:
|
|
ADDL R11, AX
|
|
ADDL $0x04, R11
|
|
MOVL AX, 12(SP)
|
|
|
|
// emitRepeat
|
|
LEAL -1(R11), SI
|
|
CMPL R11, $0x1d
|
|
JBE repeat_one_match_nolit_repeat_encodeBetterBlockAsm512K
|
|
LEAL -30(R11), SI
|
|
CMPL R11, $0x0000011e
|
|
JB repeat_two_match_nolit_repeat_encodeBetterBlockAsm512K
|
|
CMPL R11, $0x0001001e
|
|
JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm512K
|
|
MOVB $0xfc, (CX)
|
|
MOVL SI, 1(CX)
|
|
ADDQ $0x04, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K
|
|
|
|
repeat_three_match_nolit_repeat_encodeBetterBlockAsm512K:
|
|
MOVB $0xf4, (CX)
|
|
MOVW SI, 1(CX)
|
|
ADDQ $0x03, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K
|
|
|
|
repeat_two_match_nolit_repeat_encodeBetterBlockAsm512K:
|
|
MOVB $0xec, (CX)
|
|
MOVB SI, 1(CX)
|
|
ADDQ $0x02, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K
|
|
|
|
repeat_one_match_nolit_repeat_encodeBetterBlockAsm512K:
|
|
XORL SI, SI
|
|
LEAL -4(SI)(R11*8), SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x01, CX
|
|
|
|
// Common exit of the copy/repeat emitters: decides between flushing the
// remainder, returning 0, or continuing with table indexing.
match_nolit_emitcopy_end_encodeBetterBlockAsm512K:
|
|
// AX at or beyond the 32-bit limit in 8(SP) (unsigned): emit the remainder.
// 8(SP) is written outside the visible part of this function.
CMPL AX, 8(SP)
|
|
JAE emit_remainder_encodeBetterBlockAsm512K
|
|
// CX must still be below the pointer stored at (SP) to continue.
CMPQ CX, (SP)
|
|
JB match_nolit_dst_ok_encodeBetterBlockAsm512K
|
|
// Otherwise store 0 in the return slot and leave the function.
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
// Records four positions in the tables inside tmp: BX+1 and AX-2 in the
// table at offset 0, and the positions one byte after each of them in the
// table at offset 262144. Also prepares BX, R9 and R10 for index_loop.
//   hash A: ((v << 8)  * DI) >> 48  - top byte of v discarded, 16-bit index
//   hash B: ((v << 32) * R8) >> 51  - upper 4 bytes of v discarded, 13-bit index
match_nolit_dst_ok_encodeBetterBlockAsm512K:
|
|
// SI = tmp argument, used as the base of both tables.
MOVQ tmp+48(FP), SI
|
|
// Multipliers for hash A (DI) and hash B (R8).
MOVQ $0x00cf1bbcdcbfa563, DI
|
|
MOVQ $0x9e3779b1, R8
|
|
// BX = BX + 1 and R9 = AX - 2 are the two positions that get indexed.
LEAQ 1(BX), BX
|
|
LEAQ -2(AX), R9
|
|
// Load 8 bytes at each position and at the byte following it.
MOVQ (DX)(BX*1), R10
|
|
MOVQ 1(DX)(BX*1), R11
|
|
MOVQ (DX)(R9*1), R12
|
|
MOVQ 1(DX)(R9*1), R13
|
|
// R10 = hash A of the bytes at BX.
SHLQ $0x08, R10
|
|
IMULQ DI, R10
|
|
SHRQ $0x30, R10
|
|
// R11 = hash B of the bytes at BX+1.
SHLQ $0x20, R11
|
|
IMULQ R8, R11
|
|
SHRQ $0x33, R11
|
|
// R12 = hash A of the bytes at R9.
SHLQ $0x08, R12
|
|
IMULQ DI, R12
|
|
SHRQ $0x30, R12
|
|
// R13 = hash B of the bytes at R9+1.
SHLQ $0x20, R13
|
|
IMULQ R8, R13
|
|
SHRQ $0x33, R13
|
|
// R8 is no longer needed as a multiplier and is reused for BX+1.
LEAQ 1(BX), R8
|
|
LEAQ 1(R9), R14
|
|
// First table: 4-byte entries starting at SI.
MOVL BX, (SI)(R10*4)
|
|
MOVL R9, (SI)(R12*4)
|
|
// R10 = (BX + R9 + 1) >> 1, computed before BX and R9 are adjusted.
LEAQ 1(R9)(BX*1), R10
|
|
SHRQ $0x01, R10
|
|
ADDQ $0x01, BX
|
|
SUBQ $0x01, R9
|
|
// Second table: 4-byte entries starting at SI + 262144.
MOVL R8, 262144(SI)(R11*4)
|
|
MOVL R14, 262144(SI)(R13*4)
|
|
|
|
// Indexes two running positions per pass, BX and R10, each advancing by 2,
// until R10 reaches R9. Hashing is ((v << 8) * DI) >> 48 with the multiplier
// held in DI; entries are 4 bytes wide in the table based at SI. DI, SI, BX,
// R9 and R10 are all prepared by the code immediately before this label.
index_loop_encodeBetterBlockAsm512K:
|
|
// Done once R10 >= R9 (unsigned): go back to the search loop.
CMPQ R10, R9
|
|
JAE search_loop_encodeBetterBlockAsm512K
|
|
// Load 8 bytes at DX+BX and at DX+R10.
MOVQ (DX)(BX*1), R8
|
|
MOVQ (DX)(R10*1), R11
|
|
// R8 = table index for the bytes at BX.
SHLQ $0x08, R8
|
|
IMULQ DI, R8
|
|
SHRQ $0x30, R8
|
|
// R11 = table index for the bytes at R10.
SHLQ $0x08, R11
|
|
IMULQ DI, R11
|
|
SHRQ $0x30, R11
|
|
MOVL BX, (SI)(R8*4)
|
|
// NOTE(review): this slot is keyed by the bytes at offset R10 but receives
// R9, which does not change inside this loop. Storing R10 would record the
// position that was actually hashed - confirm against the generator before
// changing anything, since this file is generated.
MOVL R9, (SI)(R11*4)
|
|
ADDQ $0x02, BX
|
|
ADDQ $0x02, R10
|
|
JMP index_loop_encodeBetterBlockAsm512K
|
|
|
|
// Final output-space check before the trailing bytes are written: returns 0
// when CX + (src_len - 12(SP)) + 4 is not below the pointer stored at (SP).
emit_remainder_encodeBetterBlockAsm512K:
|
|
MOVQ src_len+32(FP), AX
|
|
// AX = src_len - 12(SP) (32-bit subtract). The literal emitter that follows
// copies from DX + 12(SP) up to src_len, so this is the byte count it will
// write as literals.
SUBL 12(SP), AX
|
|
// AX = CX + AX + 4.
LEAQ 4(CX)(AX*1), AX
|
|
// Unsigned compare against the limit kept at (SP); that slot is written
// outside the visible part of this function.
CMPQ AX, (SP)
|
|
JB emit_remainder_ok_encodeBetterBlockAsm512K
|
|
// Not enough room: store 0 in the return slot and leave the function.
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
emit_remainder_ok_encodeBetterBlockAsm512K:
|
|
MOVQ src_len+32(FP), AX
|
|
|
|
// emitLiteralsDstP
|
|
MOVL 12(SP), BX
|
|
CMPL BX, AX
|
|
JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm512K
|
|
MOVL AX, SI
|
|
MOVL AX, 12(SP)
|
|
LEAQ (DX)(BX*1), AX
|
|
SUBL BX, SI
|
|
|
|
// emitLiteral
|
|
LEAL -1(SI), DX
|
|
CMPL DX, $0x1d
|
|
JB one_byte_emit_remainder_encodeBetterBlockAsm512K
|
|
SUBL $0x1d, DX
|
|
CMPL DX, $0x00000100
|
|
JB two_bytes_emit_remainder_encodeBetterBlockAsm512K
|
|
CMPL DX, $0x00010000
|
|
JB three_bytes_emit_remainder_encodeBetterBlockAsm512K
|
|
MOVL DX, BX
|
|
SHRL $0x10, BX
|
|
MOVB $0xf8, (CX)
|
|
MOVW DX, 1(CX)
|
|
MOVB BL, 3(CX)
|
|
ADDQ $0x04, CX
|
|
ADDL $0x1d, DX
|
|
JMP memmove_long_emit_remainder_encodeBetterBlockAsm512K
|
|
|
|
three_bytes_emit_remainder_encodeBetterBlockAsm512K:
|
|
MOVB $0xf0, (CX)
|
|
MOVW DX, 1(CX)
|
|
ADDQ $0x03, CX
|
|
ADDL $0x1d, DX
|
|
JMP memmove_long_emit_remainder_encodeBetterBlockAsm512K
|
|
|
|
two_bytes_emit_remainder_encodeBetterBlockAsm512K:
|
|
MOVB $0xe8, (CX)
|
|
MOVB DL, 1(CX)
|
|
ADDL $0x1d, DX
|
|
ADDQ $0x02, CX
|
|
CMPL DX, $0x40
|
|
JB memmove_midemit_remainder_encodeBetterBlockAsm512K
|
|
JMP memmove_long_emit_remainder_encodeBetterBlockAsm512K
|
|
|
|
one_byte_emit_remainder_encodeBetterBlockAsm512K:
|
|
SHLB $0x03, DL
|
|
MOVB DL, (CX)
|
|
ADDQ $0x01, CX
|
|
LEAQ (CX)(SI*1), DX
|
|
MOVL SI, BX
|
|
|
|
// genMemMoveShort
|
|
// margin: -1, min move: 1
|
|
CMPQ BX, $0x03
|
|
JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm512K_memmove_move_1or2
|
|
JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm512K_memmove_move_3
|
|
CMPQ BX, $0x08
|
|
JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm512K_memmove_move_4through8
|
|
CMPQ BX, $0x10
|
|
JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm512K_memmove_move_8through16
|
|
CMPQ BX, $0x20
|
|
JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm512K_memmove_move_17through32
|
|
JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm512K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm512K_memmove_move_1or2:
|
|
MOVB (AX), SI
|
|
MOVB -1(AX)(BX*1), AL
|
|
MOVB SI, (CX)
|
|
MOVB AL, -1(CX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm512K
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm512K_memmove_move_3:
|
|
MOVW (AX), SI
|
|
MOVB 2(AX), AL
|
|
MOVW SI, (CX)
|
|
MOVB AL, 2(CX)
|
|
JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm512K
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm512K_memmove_move_4through8:
|
|
MOVL (AX), SI
|
|
MOVL -4(AX)(BX*1), AX
|
|
MOVL SI, (CX)
|
|
MOVL AX, -4(CX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm512K
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm512K_memmove_move_8through16:
|
|
MOVQ (AX), SI
|
|
MOVQ -8(AX)(BX*1), AX
|
|
MOVQ SI, (CX)
|
|
MOVQ AX, -8(CX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm512K
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm512K_memmove_move_17through32:
|
|
MOVOU (AX), X0
|
|
MOVOU -16(AX)(BX*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm512K
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm512K_memmove_move_33through64:
|
|
MOVOU (AX), X0
|
|
MOVOU 16(AX), X1
|
|
MOVOU -32(AX)(BX*1), X2
|
|
MOVOU -16(AX)(BX*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(BX*1)
|
|
MOVOU X3, -16(CX)(BX*1)
|
|
|
|
memmove_end_copy_emit_remainder_encodeBetterBlockAsm512K:
|
|
MOVQ DX, CX
|
|
JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm512K
|
|
|
|
memmove_midemit_remainder_encodeBetterBlockAsm512K:
|
|
LEAQ (CX)(SI*1), DX
|
|
MOVL SI, BX
|
|
|
|
// genMemMoveShort
|
|
// margin: -2, min move: 30
|
|
CMPQ BX, $0x20
|
|
JBE emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm512K_memmove_move_17through32
|
|
JMP emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm512K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm512K_memmove_move_17through32:
|
|
MOVOU (AX), X0
|
|
MOVOU -16(AX)(BX*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(BX*1)
|
|
JMP memmove_mid_end_copy_emit_remainder_encodeBetterBlockAsm512K
|
|
|
|
emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm512K_memmove_move_33through64:
|
|
MOVOU (AX), X0
|
|
MOVOU 16(AX), X1
|
|
MOVOU -32(AX)(BX*1), X2
|
|
MOVOU -16(AX)(BX*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(BX*1)
|
|
MOVOU X3, -16(CX)(BX*1)
|
|
|
|
memmove_mid_end_copy_emit_remainder_encodeBetterBlockAsm512K:
|
|
MOVQ DX, CX
|
|
JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm512K
|
|
|
|
memmove_long_emit_remainder_encodeBetterBlockAsm512K:
|
|
LEAQ (CX)(SI*1), DX
|
|
MOVL SI, BX
|
|
|
|
// genMemMoveLong
|
|
MOVOU (AX), X0
|
|
MOVOU 16(AX), X1
|
|
MOVOU -32(AX)(BX*1), X2
|
|
MOVOU -16(AX)(BX*1), X3
|
|
MOVQ BX, DI
|
|
SHRQ $0x05, DI
|
|
MOVQ CX, SI
|
|
ANDL $0x0000001f, SI
|
|
MOVQ $0x00000040, R8
|
|
SUBQ SI, R8
|
|
DECQ DI
|
|
JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm512Klarge_forward_sse_loop_32
|
|
LEAQ -32(AX)(R8*1), SI
|
|
LEAQ -32(CX)(R8*1), R9
|
|
|
|
emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm512Klarge_big_loop_back:
|
|
MOVOU (SI), X4
|
|
MOVOU 16(SI), X5
|
|
MOVOA X4, (R9)
|
|
MOVOA X5, 16(R9)
|
|
ADDQ $0x20, R9
|
|
ADDQ $0x20, SI
|
|
ADDQ $0x20, R8
|
|
DECQ DI
|
|
JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm512Klarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm512Klarge_forward_sse_loop_32:
|
|
MOVOU -32(AX)(R8*1), X4
|
|
MOVOU -16(AX)(R8*1), X5
|
|
MOVOA X4, -32(CX)(R8*1)
|
|
MOVOA X5, -16(CX)(R8*1)
|
|
ADDQ $0x20, R8
|
|
CMPQ BX, R8
|
|
JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm512Klarge_forward_sse_loop_32
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(BX*1)
|
|
MOVOU X3, -16(CX)(BX*1)
|
|
MOVQ DX, CX
|
|
|
|
emit_literal_done_emit_remainder_encodeBetterBlockAsm512K:
|
|
MOVQ dst_base+0(FP), AX
|
|
SUBQ AX, CX
|
|
MOVQ CX, ret+56(FP)
|
|
RET
|
|
|
|
// func encodeBetterBlockAsm64K(dst []byte, src []byte, tmp *[73728]byte) int
|
|
// Requires: BMI, CMOV, SSE2
|
|
TEXT ·encodeBetterBlockAsm64K(SB), $24-64
|
|
MOVQ tmp+48(FP), AX
|
|
MOVQ dst_base+0(FP), CX
|
|
MOVQ $0x00000240, DX
|
|
PXOR X0, X0
|
|
|
|
zero_loop_encodeBetterBlockAsm64K:
|
|
MOVOU X0, (AX)
|
|
MOVOU X0, 16(AX)
|
|
MOVOU X0, 32(AX)
|
|
MOVOU X0, 48(AX)
|
|
MOVOU X0, 64(AX)
|
|
MOVOU X0, 80(AX)
|
|
MOVOU X0, 96(AX)
|
|
MOVOU X0, 112(AX)
|
|
ADDQ $0x80, AX
|
|
DECQ DX
|
|
JNZ zero_loop_encodeBetterBlockAsm64K
|
|
MOVL $0x00000000, 12(SP)
|
|
MOVQ src_len+32(FP), AX
|
|
LEAQ -11(AX), DX
|
|
LEAQ -8(AX), BX
|
|
MOVL BX, 8(SP)
|
|
SHRQ $0x05, AX
|
|
SUBL AX, DX
|
|
LEAQ (CX)(DX*1), DX
|
|
MOVQ DX, (SP)
|
|
MOVL $0x00000001, AX
|
|
MOVL AX, 16(SP)
|
|
MOVQ src_base+24(FP), DX
|
|
|
|
search_loop_encodeBetterBlockAsm64K:
|
|
MOVQ tmp+48(FP), BX
|
|
MOVL AX, SI
|
|
SUBL 12(SP), SI
|
|
SHRL $0x06, SI
|
|
LEAL 1(AX)(SI*1), SI
|
|
CMPL SI, 8(SP)
|
|
JAE emit_remainder_encodeBetterBlockAsm64K
|
|
MOVQ (DX)(AX*1), DI
|
|
MOVL SI, 20(SP)
|
|
MOVQ $0x0000cf1bbcdcbf9b, R9
|
|
MOVQ $0x9e3779b1, SI
|
|
MOVQ DI, R10
|
|
MOVQ DI, R11
|
|
SHLQ $0x10, R10
|
|
IMULQ R9, R10
|
|
SHRQ $0x31, R10
|
|
SHLQ $0x20, R11
|
|
IMULQ SI, R11
|
|
SHRQ $0x34, R11
|
|
MOVWLZX (BX)(R10*2), SI
|
|
MOVWLZX 65536(BX)(R11*2), R8
|
|
MOVW AX, (BX)(R10*2)
|
|
MOVW AX, 65536(BX)(R11*2)
|
|
MOVQ (DX)(SI*1), R10
|
|
CMPQ R10, DI
|
|
JEQ candidate_match_encodeBetterBlockAsm64K
|
|
MOVQ (DX)(R8*1), R11
|
|
CMPQ R11, DI
|
|
MOVL AX, R12
|
|
SUBL 16(SP), R12
|
|
MOVQ (DX)(R12*1), R12
|
|
MOVQ $0x000000ffffffff00, R13
|
|
XORQ DI, R12
|
|
TESTQ R13, R12
|
|
JNE no_repeat_found_encodeBetterBlockAsm64K
|
|
LEAL 1(AX), BX
|
|
MOVL 12(SP), SI
|
|
MOVL BX, DI
|
|
SUBL 16(SP), DI
|
|
JZ repeat_extend_back_end_encodeBetterBlockAsm64K
|
|
|
|
repeat_extend_back_loop_encodeBetterBlockAsm64K:
|
|
CMPL BX, SI
|
|
JBE repeat_extend_back_end_encodeBetterBlockAsm64K
|
|
MOVB -1(DX)(DI*1), R8
|
|
MOVB -1(DX)(BX*1), R9
|
|
CMPB R8, R9
|
|
JNE repeat_extend_back_end_encodeBetterBlockAsm64K
|
|
LEAL -1(BX), BX
|
|
DECL DI
|
|
JNZ repeat_extend_back_loop_encodeBetterBlockAsm64K
|
|
|
|
repeat_extend_back_end_encodeBetterBlockAsm64K:
|
|
MOVL BX, SI
|
|
SUBL 12(SP), SI
|
|
LEAQ 4(CX)(SI*1), SI
|
|
CMPQ SI, (SP)
|
|
JB repeat_dst_size_check_encodeBetterBlockAsm64K
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
repeat_dst_size_check_encodeBetterBlockAsm64K:
|
|
// emitLiteralsDstP
|
|
MOVL 12(SP), SI
|
|
CMPL SI, BX
|
|
JEQ emit_literal_done_repeat_emit_encodeBetterBlockAsm64K
|
|
MOVL BX, DI
|
|
MOVL BX, 12(SP)
|
|
LEAQ (DX)(SI*1), R8
|
|
SUBL SI, DI
|
|
|
|
// emitLiteral
|
|
LEAL -1(DI), SI
|
|
CMPL SI, $0x1d
|
|
JB one_byte_repeat_emit_encodeBetterBlockAsm64K
|
|
SUBL $0x1d, SI
|
|
CMPL SI, $0x00000100
|
|
JB two_bytes_repeat_emit_encodeBetterBlockAsm64K
|
|
JB three_bytes_repeat_emit_encodeBetterBlockAsm64K
|
|
MOVL SI, R9
|
|
SHRL $0x10, R9
|
|
MOVB $0xf8, (CX)
|
|
MOVW SI, 1(CX)
|
|
MOVB R9, 3(CX)
|
|
ADDQ $0x04, CX
|
|
ADDL $0x1d, SI
|
|
JMP memmove_long_repeat_emit_encodeBetterBlockAsm64K
|
|
|
|
three_bytes_repeat_emit_encodeBetterBlockAsm64K:
|
|
MOVB $0xf0, (CX)
|
|
MOVW SI, 1(CX)
|
|
ADDQ $0x03, CX
|
|
ADDL $0x1d, SI
|
|
JMP memmove_long_repeat_emit_encodeBetterBlockAsm64K
|
|
|
|
two_bytes_repeat_emit_encodeBetterBlockAsm64K:
|
|
MOVB $0xe8, (CX)
|
|
MOVB SI, 1(CX)
|
|
ADDL $0x1d, SI
|
|
ADDQ $0x02, CX
|
|
CMPL SI, $0x40
|
|
JB memmove_midrepeat_emit_encodeBetterBlockAsm64K
|
|
JMP memmove_long_repeat_emit_encodeBetterBlockAsm64K
|
|
|
|
one_byte_repeat_emit_encodeBetterBlockAsm64K:
|
|
SHLB $0x03, SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x01, CX
|
|
LEAQ (CX)(DI*1), SI
|
|
|
|
// genMemMoveShort
|
|
// margin: 8, min move: 1
|
|
CMPQ DI, $0x08
|
|
JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm64K_memmove_move_8
|
|
CMPQ DI, $0x10
|
|
JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm64K_memmove_move_8through16
|
|
CMPQ DI, $0x20
|
|
JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm64K_memmove_move_17through32
|
|
JMP emit_lit_memmove_repeat_emit_encodeBetterBlockAsm64K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_repeat_emit_encodeBetterBlockAsm64K_memmove_move_8:
|
|
MOVQ (R8), R9
|
|
MOVQ R9, (CX)
|
|
JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm64K
|
|
|
|
emit_lit_memmove_repeat_emit_encodeBetterBlockAsm64K_memmove_move_8through16:
|
|
MOVQ (R8), R9
|
|
MOVQ -8(R8)(DI*1), R8
|
|
MOVQ R9, (CX)
|
|
MOVQ R8, -8(CX)(DI*1)
|
|
JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm64K
|
|
|
|
emit_lit_memmove_repeat_emit_encodeBetterBlockAsm64K_memmove_move_17through32:
|
|
MOVOU (R8), X0
|
|
MOVOU -16(R8)(DI*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(DI*1)
|
|
JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm64K
|
|
|
|
emit_lit_memmove_repeat_emit_encodeBetterBlockAsm64K_memmove_move_33through64:
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(DI*1), X2
|
|
MOVOU -16(R8)(DI*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(DI*1)
|
|
MOVOU X3, -16(CX)(DI*1)
|
|
|
|
memmove_end_copy_repeat_emit_encodeBetterBlockAsm64K:
|
|
MOVQ SI, CX
|
|
JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm64K
|
|
|
|
memmove_midrepeat_emit_encodeBetterBlockAsm64K:
|
|
LEAQ (CX)(DI*1), SI
|
|
|
|
// genMemMoveShort
|
|
// margin: 8, min move: 30
|
|
CMPQ DI, $0x20
|
|
JBE emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm64K_memmove_move_17through32
|
|
JMP emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm64K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm64K_memmove_move_17through32:
|
|
MOVOU (R8), X0
|
|
MOVOU -16(R8)(DI*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(DI*1)
|
|
JMP memmove_mid_end_copy_repeat_emit_encodeBetterBlockAsm64K
|
|
|
|
emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm64K_memmove_move_33through64:
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(DI*1), X2
|
|
MOVOU -16(R8)(DI*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(DI*1)
|
|
MOVOU X3, -16(CX)(DI*1)
|
|
|
|
memmove_mid_end_copy_repeat_emit_encodeBetterBlockAsm64K:
|
|
MOVQ SI, CX
|
|
JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm64K
|
|
|
|
memmove_long_repeat_emit_encodeBetterBlockAsm64K:
|
|
LEAQ (CX)(DI*1), SI
|
|
|
|
// genMemMoveLong
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(DI*1), X2
|
|
MOVOU -16(R8)(DI*1), X3
|
|
MOVQ DI, R10
|
|
SHRQ $0x05, R10
|
|
MOVQ CX, R9
|
|
ANDL $0x0000001f, R9
|
|
MOVQ $0x00000040, R11
|
|
SUBQ R9, R11
|
|
DECQ R10
|
|
JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm64Klarge_forward_sse_loop_32
|
|
LEAQ -32(R8)(R11*1), R9
|
|
LEAQ -32(CX)(R11*1), R12
|
|
|
|
emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm64Klarge_big_loop_back:
|
|
MOVOU (R9), X4
|
|
MOVOU 16(R9), X5
|
|
MOVOA X4, (R12)
|
|
MOVOA X5, 16(R12)
|
|
ADDQ $0x20, R12
|
|
ADDQ $0x20, R9
|
|
ADDQ $0x20, R11
|
|
DECQ R10
|
|
JNA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm64Klarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm64Klarge_forward_sse_loop_32:
|
|
MOVOU -32(R8)(R11*1), X4
|
|
MOVOU -16(R8)(R11*1), X5
|
|
MOVOA X4, -32(CX)(R11*1)
|
|
MOVOA X5, -16(CX)(R11*1)
|
|
ADDQ $0x20, R11
|
|
CMPQ DI, R11
|
|
JAE emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm64Klarge_forward_sse_loop_32
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(DI*1)
|
|
MOVOU X3, -16(CX)(DI*1)
|
|
MOVQ SI, CX
|
|
|
|
emit_literal_done_repeat_emit_encodeBetterBlockAsm64K:
|
|
ADDL $0x05, AX
|
|
MOVL AX, SI
|
|
SUBL 16(SP), SI
|
|
MOVQ src_len+32(FP), DI
|
|
SUBL AX, DI
|
|
LEAQ (DX)(AX*1), R8
|
|
LEAQ (DX)(SI*1), SI
|
|
|
|
// matchLen
|
|
XORL R10, R10
|
|
JMP matchlen_loop_16_entry_repeat_extend_encodeBetterBlockAsm64K
|
|
|
|
matchlen_loopback_16_repeat_extend_encodeBetterBlockAsm64K:
|
|
MOVQ (R8)(R10*1), R9
|
|
MOVQ 8(R8)(R10*1), R11
|
|
XORQ (SI)(R10*1), R9
|
|
JNZ matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm64K
|
|
XORQ 8(SI)(R10*1), R11
|
|
JNZ matchlen_bsf_16repeat_extend_encodeBetterBlockAsm64K
|
|
LEAL -16(DI), DI
|
|
LEAL 16(R10), R10
|
|
|
|
matchlen_loop_16_entry_repeat_extend_encodeBetterBlockAsm64K:
|
|
CMPL DI, $0x10
|
|
JAE matchlen_loopback_16_repeat_extend_encodeBetterBlockAsm64K
|
|
JMP matchlen_match8_repeat_extend_encodeBetterBlockAsm64K
|
|
|
|
matchlen_bsf_16repeat_extend_encodeBetterBlockAsm64K:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R11, R11
|
|
|
|
#else
|
|
BSFQ R11, R11
|
|
|
|
#endif
|
|
SARQ $0x03, R11
|
|
LEAL 8(R10)(R11*1), R10
|
|
JMP repeat_extend_forward_end_encodeBetterBlockAsm64K
|
|
|
|
matchlen_match8_repeat_extend_encodeBetterBlockAsm64K:
|
|
CMPL DI, $0x08
|
|
JB matchlen_match4_repeat_extend_encodeBetterBlockAsm64K
|
|
MOVQ (R8)(R10*1), R9
|
|
XORQ (SI)(R10*1), R9
|
|
JNZ matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm64K
|
|
LEAL -8(DI), DI
|
|
LEAL 8(R10), R10
|
|
JMP matchlen_match4_repeat_extend_encodeBetterBlockAsm64K
|
|
|
|
matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm64K:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R9, R9
|
|
|
|
#else
|
|
BSFQ R9, R9
|
|
|
|
#endif
|
|
SARQ $0x03, R9
|
|
LEAL (R10)(R9*1), R10
|
|
JMP repeat_extend_forward_end_encodeBetterBlockAsm64K
|
|
|
|
matchlen_match4_repeat_extend_encodeBetterBlockAsm64K:
|
|
CMPL DI, $0x04
|
|
JB matchlen_match2_repeat_extend_encodeBetterBlockAsm64K
|
|
MOVL (R8)(R10*1), R9
|
|
CMPL (SI)(R10*1), R9
|
|
JNE matchlen_match2_repeat_extend_encodeBetterBlockAsm64K
|
|
LEAL -4(DI), DI
|
|
LEAL 4(R10), R10
|
|
|
|
matchlen_match2_repeat_extend_encodeBetterBlockAsm64K:
|
|
CMPL DI, $0x01
|
|
JE matchlen_match1_repeat_extend_encodeBetterBlockAsm64K
|
|
JB repeat_extend_forward_end_encodeBetterBlockAsm64K
|
|
MOVW (R8)(R10*1), R9
|
|
CMPW (SI)(R10*1), R9
|
|
JNE matchlen_match1_repeat_extend_encodeBetterBlockAsm64K
|
|
LEAL 2(R10), R10
|
|
SUBL $0x02, DI
|
|
JZ repeat_extend_forward_end_encodeBetterBlockAsm64K
|
|
|
|
matchlen_match1_repeat_extend_encodeBetterBlockAsm64K:
|
|
MOVB (R8)(R10*1), R9
|
|
CMPB (SI)(R10*1), R9
|
|
JNE repeat_extend_forward_end_encodeBetterBlockAsm64K
|
|
LEAL 1(R10), R10
|
|
|
|
repeat_extend_forward_end_encodeBetterBlockAsm64K:
|
|
ADDL R10, AX
|
|
MOVL AX, SI
|
|
SUBL BX, SI
|
|
MOVL 16(SP), BX
|
|
|
|
// emitRepeat
|
|
LEAL -1(SI), BX
|
|
CMPL SI, $0x1d
|
|
JBE repeat_one_match_repeat_encodeBetterBlockAsm64K
|
|
LEAL -30(SI), BX
|
|
CMPL SI, $0x0000011e
|
|
JB repeat_two_match_repeat_encodeBetterBlockAsm64K
|
|
CMPL SI, $0x0001001e
|
|
JB repeat_three_match_repeat_encodeBetterBlockAsm64K
|
|
MOVB $0xfc, (CX)
|
|
MOVL BX, 1(CX)
|
|
ADDQ $0x04, CX
|
|
JMP repeat_end_emit_encodeBetterBlockAsm64K
|
|
|
|
repeat_three_match_repeat_encodeBetterBlockAsm64K:
|
|
MOVB $0xf4, (CX)
|
|
MOVW BX, 1(CX)
|
|
ADDQ $0x03, CX
|
|
JMP repeat_end_emit_encodeBetterBlockAsm64K
|
|
|
|
repeat_two_match_repeat_encodeBetterBlockAsm64K:
|
|
MOVB $0xec, (CX)
|
|
MOVB BL, 1(CX)
|
|
ADDQ $0x02, CX
|
|
JMP repeat_end_emit_encodeBetterBlockAsm64K
|
|
|
|
repeat_one_match_repeat_encodeBetterBlockAsm64K:
|
|
XORL BX, BX
|
|
LEAL -4(BX)(SI*8), BX
|
|
MOVB BL, (CX)
|
|
ADDQ $0x01, CX
|
|
|
|
repeat_end_emit_encodeBetterBlockAsm64K:
|
|
MOVL AX, 12(SP)
|
|
JMP search_loop_encodeBetterBlockAsm64K
|
|
|
|
no_repeat_found_encodeBetterBlockAsm64K:
|
|
CMPL R10, DI
|
|
JEQ candidate_match_encodeBetterBlockAsm64K
|
|
CMPL R11, DI
|
|
JEQ candidateS_match_encodeBetterBlockAsm64K
|
|
MOVL 20(SP), AX
|
|
JMP search_loop_encodeBetterBlockAsm64K
|
|
|
|
candidateS_match_encodeBetterBlockAsm64K:
|
|
SHRQ $0x08, DI
|
|
MOVQ DI, R10
|
|
SHLQ $0x10, R10
|
|
IMULQ R9, R10
|
|
SHRQ $0x31, R10
|
|
MOVWLZX (BX)(R10*2), SI
|
|
INCL AX
|
|
MOVW AX, (BX)(R10*2)
|
|
CMPL (DX)(SI*1), DI
|
|
JEQ candidate_match_encodeBetterBlockAsm64K
|
|
DECL AX
|
|
MOVL R8, SI
|
|
|
|
candidate_match_encodeBetterBlockAsm64K:
|
|
MOVL 12(SP), BX
|
|
TESTL SI, SI
|
|
JZ match_extend_back_end_encodeBetterBlockAsm64K
|
|
|
|
match_extend_back_loop_encodeBetterBlockAsm64K:
|
|
CMPL AX, BX
|
|
JBE match_extend_back_end_encodeBetterBlockAsm64K
|
|
MOVB -1(DX)(SI*1), DI
|
|
MOVB -1(DX)(AX*1), R8
|
|
CMPB DI, R8
|
|
JNE match_extend_back_end_encodeBetterBlockAsm64K
|
|
LEAL -1(AX), AX
|
|
DECL SI
|
|
JZ match_extend_back_end_encodeBetterBlockAsm64K
|
|
JMP match_extend_back_loop_encodeBetterBlockAsm64K
|
|
|
|
match_extend_back_end_encodeBetterBlockAsm64K:
|
|
MOVL AX, BX
|
|
SUBL 12(SP), BX
|
|
LEAQ 4(CX)(BX*1), BX
|
|
CMPQ BX, (SP)
|
|
JB match_dst_size_check_encodeBetterBlockAsm64K
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
match_dst_size_check_encodeBetterBlockAsm64K:
|
|
MOVL AX, BX
|
|
ADDL $0x04, AX
|
|
ADDL $0x04, SI
|
|
MOVQ src_len+32(FP), DI
|
|
SUBL AX, DI
|
|
LEAQ (DX)(AX*1), R8
|
|
LEAQ (DX)(SI*1), R9
|
|
|
|
// matchLen
|
|
XORL R11, R11
|
|
JMP matchlen_loop_16_entry_match_nolit_encodeBetterBlockAsm64K
|
|
|
|
matchlen_loopback_16_match_nolit_encodeBetterBlockAsm64K:
|
|
MOVQ (R8)(R11*1), R10
|
|
MOVQ 8(R8)(R11*1), R12
|
|
XORQ (R9)(R11*1), R10
|
|
JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm64K
|
|
XORQ 8(R9)(R11*1), R12
|
|
JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm64K
|
|
LEAL -16(DI), DI
|
|
LEAL 16(R11), R11
|
|
|
|
matchlen_loop_16_entry_match_nolit_encodeBetterBlockAsm64K:
|
|
CMPL DI, $0x10
|
|
JAE matchlen_loopback_16_match_nolit_encodeBetterBlockAsm64K
|
|
JMP matchlen_match8_match_nolit_encodeBetterBlockAsm64K
|
|
|
|
matchlen_bsf_16match_nolit_encodeBetterBlockAsm64K:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R12, R12
|
|
|
|
#else
|
|
BSFQ R12, R12
|
|
|
|
#endif
|
|
SARQ $0x03, R12
|
|
LEAL 8(R11)(R12*1), R11
|
|
JMP match_nolit_end_encodeBetterBlockAsm64K
|
|
|
|
matchlen_match8_match_nolit_encodeBetterBlockAsm64K:
|
|
CMPL DI, $0x08
|
|
JB matchlen_match4_match_nolit_encodeBetterBlockAsm64K
|
|
MOVQ (R8)(R11*1), R10
|
|
XORQ (R9)(R11*1), R10
|
|
JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm64K
|
|
LEAL -8(DI), DI
|
|
LEAL 8(R11), R11
|
|
JMP matchlen_match4_match_nolit_encodeBetterBlockAsm64K
|
|
|
|
matchlen_bsf_8_match_nolit_encodeBetterBlockAsm64K:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R10, R10
|
|
|
|
#else
|
|
BSFQ R10, R10
|
|
|
|
#endif
|
|
SARQ $0x03, R10
|
|
LEAL (R11)(R10*1), R11
|
|
JMP match_nolit_end_encodeBetterBlockAsm64K
|
|
|
|
matchlen_match4_match_nolit_encodeBetterBlockAsm64K:
|
|
CMPL DI, $0x04
|
|
JB matchlen_match2_match_nolit_encodeBetterBlockAsm64K
|
|
MOVL (R8)(R11*1), R10
|
|
CMPL (R9)(R11*1), R10
|
|
JNE matchlen_match2_match_nolit_encodeBetterBlockAsm64K
|
|
LEAL -4(DI), DI
|
|
LEAL 4(R11), R11
|
|
|
|
matchlen_match2_match_nolit_encodeBetterBlockAsm64K:
|
|
CMPL DI, $0x01
|
|
JE matchlen_match1_match_nolit_encodeBetterBlockAsm64K
|
|
JB match_nolit_end_encodeBetterBlockAsm64K
|
|
MOVW (R8)(R11*1), R10
|
|
CMPW (R9)(R11*1), R10
|
|
JNE matchlen_match1_match_nolit_encodeBetterBlockAsm64K
|
|
LEAL 2(R11), R11
|
|
SUBL $0x02, DI
|
|
JZ match_nolit_end_encodeBetterBlockAsm64K
|
|
|
|
matchlen_match1_match_nolit_encodeBetterBlockAsm64K:
|
|
MOVB (R8)(R11*1), R10
|
|
CMPB (R9)(R11*1), R10
|
|
JNE match_nolit_end_encodeBetterBlockAsm64K
|
|
LEAL 1(R11), R11
|
|
|
|
match_nolit_end_encodeBetterBlockAsm64K:
|
|
MOVL AX, DI
|
|
SUBL SI, DI
|
|
MOVL DI, 16(SP)
|
|
|
|
// Check if we can combine lit+copy
|
|
MOVLQZX 12(SP), R8
|
|
MOVL BX, SI
|
|
SUBL R8, SI
|
|
JZ match_emit_nolits_encodeBetterBlockAsm64K
|
|
CMPL DI, $0x00000040
|
|
JL match_emit_lits_encodeBetterBlockAsm64K
|
|
CMPL SI, $0x04
|
|
JA match_emit_lits_encodeBetterBlockAsm64K
|
|
MOVL (DX)(R8*1), R8
|
|
ADDL R11, AX
|
|
ADDL $0x04, R11
|
|
MOVL AX, 12(SP)
|
|
|
|
// emitCopy2WithLits
|
|
XORQ R9, R9
|
|
SUBL $0x40, DI
|
|
LEAL -11(R11), R10
|
|
LEAL -4(R11), R11
|
|
MOVW DI, 1(CX)
|
|
CMPL R11, $0x07
|
|
CMOVLGE R10, R9
|
|
MOVQ $0x00000007, DI
|
|
CMOVLLT R11, DI
|
|
LEAL -1(SI)(DI*4), DI
|
|
MOVL $0x00000003, R10
|
|
LEAL (R10)(DI*8), DI
|
|
MOVB DI, (CX)
|
|
ADDQ $0x03, CX
|
|
MOVL R8, (CX)
|
|
ADDQ SI, CX
|
|
TESTL R9, R9
|
|
JZ match_nolit_emitcopy_end_encodeBetterBlockAsm64K
|
|
|
|
// emitRepeat
|
|
LEAL -1(R9), SI
|
|
CMPL R9, $0x1d
|
|
JBE repeat_one_match_emit_repeat_copy2_encodeBetterBlockAsm64K
|
|
LEAL -30(R9), SI
|
|
CMPL R9, $0x0000011e
|
|
JB repeat_two_match_emit_repeat_copy2_encodeBetterBlockAsm64K
|
|
CMPL R9, $0x0001001e
|
|
JB repeat_three_match_emit_repeat_copy2_encodeBetterBlockAsm64K
|
|
MOVB $0xfc, (CX)
|
|
MOVL SI, 1(CX)
|
|
ADDQ $0x04, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm64K
|
|
|
|
repeat_three_match_emit_repeat_copy2_encodeBetterBlockAsm64K:
|
|
MOVB $0xf4, (CX)
|
|
MOVW SI, 1(CX)
|
|
ADDQ $0x03, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm64K
|
|
|
|
repeat_two_match_emit_repeat_copy2_encodeBetterBlockAsm64K:
|
|
MOVB $0xec, (CX)
|
|
MOVB SI, 1(CX)
|
|
ADDQ $0x02, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm64K
|
|
|
|
repeat_one_match_emit_repeat_copy2_encodeBetterBlockAsm64K:
|
|
XORL SI, SI
|
|
LEAL -4(SI)(R9*8), SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x01, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm64K
|
|
|
|
match_emit_lits_encodeBetterBlockAsm64K:
|
|
LEAQ (DX)(R8*1), R8
|
|
|
|
// emitLiteral
|
|
LEAL -1(SI), R9
|
|
CMPL R9, $0x1d
|
|
JB one_byte_match_emit_encodeBetterBlockAsm64K
|
|
SUBL $0x1d, R9
|
|
CMPL R9, $0x00000100
|
|
JB two_bytes_match_emit_encodeBetterBlockAsm64K
|
|
JB three_bytes_match_emit_encodeBetterBlockAsm64K
|
|
MOVL R9, R10
|
|
SHRL $0x10, R10
|
|
MOVB $0xf8, (CX)
|
|
MOVW R9, 1(CX)
|
|
MOVB R10, 3(CX)
|
|
ADDQ $0x04, CX
|
|
ADDL $0x1d, R9
|
|
JMP memmove_long_match_emit_encodeBetterBlockAsm64K
|
|
|
|
three_bytes_match_emit_encodeBetterBlockAsm64K:
|
|
MOVB $0xf0, (CX)
|
|
MOVW R9, 1(CX)
|
|
ADDQ $0x03, CX
|
|
ADDL $0x1d, R9
|
|
JMP memmove_long_match_emit_encodeBetterBlockAsm64K
|
|
|
|
two_bytes_match_emit_encodeBetterBlockAsm64K:
|
|
MOVB $0xe8, (CX)
|
|
MOVB R9, 1(CX)
|
|
ADDL $0x1d, R9
|
|
ADDQ $0x02, CX
|
|
CMPL R9, $0x40
|
|
JB memmove_midmatch_emit_encodeBetterBlockAsm64K
|
|
JMP memmove_long_match_emit_encodeBetterBlockAsm64K
|
|
|
|
one_byte_match_emit_encodeBetterBlockAsm64K:
|
|
SHLB $0x03, R9
|
|
MOVB R9, (CX)
|
|
ADDQ $0x01, CX
|
|
LEAQ (CX)(SI*1), R9
|
|
|
|
// genMemMoveShort
|
|
// margin: 8, min move: 1
|
|
CMPQ SI, $0x08
|
|
JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm64K_memmove_move_8
|
|
CMPQ SI, $0x10
|
|
JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm64K_memmove_move_8through16
|
|
CMPQ SI, $0x20
|
|
JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm64K_memmove_move_17through32
|
|
JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm64K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_match_emit_encodeBetterBlockAsm64K_memmove_move_8:
|
|
MOVQ (R8), R10
|
|
MOVQ R10, (CX)
|
|
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm64K
|
|
|
|
emit_lit_memmove_match_emit_encodeBetterBlockAsm64K_memmove_move_8through16:
|
|
MOVQ (R8), R10
|
|
MOVQ -8(R8)(SI*1), R8
|
|
MOVQ R10, (CX)
|
|
MOVQ R8, -8(CX)(SI*1)
|
|
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm64K
|
|
|
|
emit_lit_memmove_match_emit_encodeBetterBlockAsm64K_memmove_move_17through32:
|
|
MOVOU (R8), X0
|
|
MOVOU -16(R8)(SI*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(SI*1)
|
|
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm64K
|
|
|
|
emit_lit_memmove_match_emit_encodeBetterBlockAsm64K_memmove_move_33through64:
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(SI*1), X2
|
|
MOVOU -16(R8)(SI*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(SI*1)
|
|
MOVOU X3, -16(CX)(SI*1)
|
|
|
|
memmove_end_copy_match_emit_encodeBetterBlockAsm64K:
|
|
MOVQ R9, CX
|
|
JMP match_emit_nolits_encodeBetterBlockAsm64K
|
|
|
|
memmove_midmatch_emit_encodeBetterBlockAsm64K:
|
|
LEAQ (CX)(SI*1), R9
|
|
|
|
// genMemMoveShort
|
|
// margin: 8, min move: 30
|
|
CMPQ SI, $0x20
|
|
JBE emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm64K_memmove_move_17through32
|
|
JMP emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm64K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm64K_memmove_move_17through32:
|
|
MOVOU (R8), X0
|
|
MOVOU -16(R8)(SI*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(SI*1)
|
|
JMP memmove_mid_end_copy_match_emit_encodeBetterBlockAsm64K
|
|
|
|
emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm64K_memmove_move_33through64:
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(SI*1), X2
|
|
MOVOU -16(R8)(SI*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(SI*1)
|
|
MOVOU X3, -16(CX)(SI*1)
|
|
|
|
memmove_mid_end_copy_match_emit_encodeBetterBlockAsm64K:
|
|
MOVQ R9, CX
|
|
JMP match_emit_nolits_encodeBetterBlockAsm64K
|
|
|
|
memmove_long_match_emit_encodeBetterBlockAsm64K:
|
|
LEAQ (CX)(SI*1), R9
|
|
|
|
// genMemMoveLong
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(SI*1), X2
|
|
MOVOU -16(R8)(SI*1), X3
|
|
MOVQ SI, R12
|
|
SHRQ $0x05, R12
|
|
MOVQ CX, R10
|
|
ANDL $0x0000001f, R10
|
|
MOVQ $0x00000040, R13
|
|
SUBQ R10, R13
|
|
DECQ R12
|
|
JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm64Klarge_forward_sse_loop_32
|
|
LEAQ -32(R8)(R13*1), R10
|
|
LEAQ -32(CX)(R13*1), R14
|
|
|
|
emit_lit_memmove_long_match_emit_encodeBetterBlockAsm64Klarge_big_loop_back:
|
|
MOVOU (R10), X4
|
|
MOVOU 16(R10), X5
|
|
MOVOA X4, (R14)
|
|
MOVOA X5, 16(R14)
|
|
ADDQ $0x20, R14
|
|
ADDQ $0x20, R10
|
|
ADDQ $0x20, R13
|
|
DECQ R12
|
|
JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm64Klarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_match_emit_encodeBetterBlockAsm64Klarge_forward_sse_loop_32:
|
|
MOVOU -32(R8)(R13*1), X4
|
|
MOVOU -16(R8)(R13*1), X5
|
|
MOVOA X4, -32(CX)(R13*1)
|
|
MOVOA X5, -16(CX)(R13*1)
|
|
ADDQ $0x20, R13
|
|
CMPQ SI, R13
|
|
JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm64Klarge_forward_sse_loop_32
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(SI*1)
|
|
MOVOU X3, -16(CX)(SI*1)
|
|
MOVQ R9, CX
|
|
|
|
match_emit_nolits_encodeBetterBlockAsm64K:
|
|
ADDL R11, AX
|
|
ADDL $0x04, R11
|
|
MOVL AX, 12(SP)
|
|
|
|
// emitCopy
|
|
CMPL DI, $0x00000400
|
|
JA two_byte_match_nolit_encodeBetterBlockAsm64K
|
|
CMPL R11, $0x00000013
|
|
JAE emit_one_longer_match_nolit_encodeBetterBlockAsm64K
|
|
LEAL -1(DI), SI
|
|
SHLL $0x06, SI
|
|
LEAL -15(SI)(R11*4), SI
|
|
MOVW SI, (CX)
|
|
ADDQ $0x02, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm64K
|
|
|
|
emit_one_longer_match_nolit_encodeBetterBlockAsm64K:
|
|
CMPL R11, $0x00000112
|
|
JAE emit_copy1_repeat_match_nolit_encodeBetterBlockAsm64K
|
|
LEAL -1(DI), SI
|
|
SHLL $0x06, SI
|
|
LEAL 61(SI), SI
|
|
MOVW SI, (CX)
|
|
LEAL -18(R11), SI
|
|
MOVB SI, 2(CX)
|
|
ADDQ $0x03, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm64K
|
|
|
|
emit_copy1_repeat_match_nolit_encodeBetterBlockAsm64K:
|
|
LEAL -1(DI), SI
|
|
SHLL $0x06, SI
|
|
LEAL 57(SI), SI
|
|
MOVW SI, (CX)
|
|
ADDQ $0x02, CX
|
|
SUBL $0x12, R11
|
|
|
|
// emitRepeat
|
|
LEAL -1(R11), SI
|
|
CMPL R11, $0x1d
|
|
JBE repeat_one_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm64K
|
|
LEAL -30(R11), SI
|
|
CMPL R11, $0x0000011e
|
|
JB repeat_two_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm64K
|
|
CMPL R11, $0x0001001e
|
|
JB repeat_three_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm64K
|
|
MOVB $0xfc, (CX)
|
|
MOVL SI, 1(CX)
|
|
ADDQ $0x04, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm64K
|
|
|
|
repeat_three_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm64K:
|
|
MOVB $0xf4, (CX)
|
|
MOVW SI, 1(CX)
|
|
ADDQ $0x03, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm64K
|
|
|
|
repeat_two_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm64K:
|
|
MOVB $0xec, (CX)
|
|
MOVB SI, 1(CX)
|
|
ADDQ $0x02, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm64K
|
|
|
|
repeat_one_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm64K:
|
|
XORL SI, SI
|
|
LEAL -4(SI)(R11*8), SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x01, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm64K
|
|
|
|
two_byte_match_nolit_encodeBetterBlockAsm64K:
|
|
// emitCopy2
|
|
LEAL -64(DI), DI
|
|
LEAL -4(R11), R11
|
|
MOVW DI, 1(CX)
|
|
CMPL R11, $0x3c
|
|
JBE emit_copy2_0_match_nolit_encodeBetterBlockAsm64K_emit2
|
|
LEAL -60(R11), SI
|
|
CMPL R11, $0x0000013c
|
|
JB emit_copy2_1_match_nolit_encodeBetterBlockAsm64K_emit2
|
|
CMPL R11, $0x0001003c
|
|
JB emit_copy2_2_match_nolit_encodeBetterBlockAsm64K_emit2
|
|
MOVB $0xfe, (CX)
|
|
MOVL SI, 3(CX)
|
|
ADDQ $0x06, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm64K
|
|
|
|
emit_copy2_2_match_nolit_encodeBetterBlockAsm64K_emit2:
|
|
MOVB $0xfa, (CX)
|
|
MOVW SI, 3(CX)
|
|
ADDQ $0x05, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm64K
|
|
|
|
emit_copy2_1_match_nolit_encodeBetterBlockAsm64K_emit2:
|
|
MOVB $0xf6, (CX)
|
|
MOVB SI, 3(CX)
|
|
ADDQ $0x04, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm64K
|
|
|
|
emit_copy2_0_match_nolit_encodeBetterBlockAsm64K_emit2:
|
|
MOVL $0x00000002, SI
|
|
LEAL (SI)(R11*4), SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x03, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm64K
|
|
|
|
// emitLiteralsDstP
|
|
MOVL 12(SP), SI
|
|
CMPL SI, BX
|
|
JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm64K
|
|
MOVL BX, DI
|
|
MOVL BX, 12(SP)
|
|
LEAQ (DX)(SI*1), R8
|
|
SUBL SI, DI
|
|
|
|
// emitLiteral
|
|
LEAL -1(DI), SI
|
|
CMPL SI, $0x1d
|
|
JB one_byte_match_emit_repeat_encodeBetterBlockAsm64K
|
|
SUBL $0x1d, SI
|
|
CMPL SI, $0x00000100
|
|
JB two_bytes_match_emit_repeat_encodeBetterBlockAsm64K
|
|
JB three_bytes_match_emit_repeat_encodeBetterBlockAsm64K
|
|
MOVL SI, R9
|
|
SHRL $0x10, R9
|
|
MOVB $0xf8, (CX)
|
|
MOVW SI, 1(CX)
|
|
MOVB R9, 3(CX)
|
|
ADDQ $0x04, CX
|
|
ADDL $0x1d, SI
|
|
JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm64K
|
|
|
|
three_bytes_match_emit_repeat_encodeBetterBlockAsm64K:
|
|
MOVB $0xf0, (CX)
|
|
MOVW SI, 1(CX)
|
|
ADDQ $0x03, CX
|
|
ADDL $0x1d, SI
|
|
JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm64K
|
|
|
|
two_bytes_match_emit_repeat_encodeBetterBlockAsm64K:
|
|
MOVB $0xe8, (CX)
|
|
MOVB SI, 1(CX)
|
|
ADDL $0x1d, SI
|
|
ADDQ $0x02, CX
|
|
CMPL SI, $0x40
|
|
JB memmove_midmatch_emit_repeat_encodeBetterBlockAsm64K
|
|
JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm64K
|
|
|
|
one_byte_match_emit_repeat_encodeBetterBlockAsm64K:
|
|
SHLB $0x03, SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x01, CX
|
|
LEAQ (CX)(DI*1), SI
|
|
|
|
// genMemMoveShort
|
|
// margin: 8, min move: 1
|
|
CMPQ DI, $0x08
|
|
JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm64K_memmove_move_8
|
|
CMPQ DI, $0x10
|
|
JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm64K_memmove_move_8through16
|
|
CMPQ DI, $0x20
|
|
JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm64K_memmove_move_17through32
|
|
JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm64K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm64K_memmove_move_8:
|
|
MOVQ (R8), R9
|
|
MOVQ R9, (CX)
|
|
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm64K
|
|
|
|
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm64K_memmove_move_8through16:
|
|
MOVQ (R8), R9
|
|
MOVQ -8(R8)(DI*1), R8
|
|
MOVQ R9, (CX)
|
|
MOVQ R8, -8(CX)(DI*1)
|
|
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm64K
|
|
|
|
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm64K_memmove_move_17through32:
|
|
MOVOU (R8), X0
|
|
MOVOU -16(R8)(DI*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(DI*1)
|
|
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm64K
|
|
|
|
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm64K_memmove_move_33through64:
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(DI*1), X2
|
|
MOVOU -16(R8)(DI*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(DI*1)
|
|
MOVOU X3, -16(CX)(DI*1)
|
|
|
|
memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm64K:
|
|
MOVQ SI, CX
|
|
JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm64K
|
|
|
|
memmove_midmatch_emit_repeat_encodeBetterBlockAsm64K:
|
|
LEAQ (CX)(DI*1), SI
|
|
|
|
// genMemMoveShort
|
|
// margin: 8, min move: 30
|
|
CMPQ DI, $0x20
|
|
JBE emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm64K_memmove_move_17through32
|
|
JMP emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm64K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm64K_memmove_move_17through32:
|
|
MOVOU (R8), X0
|
|
MOVOU -16(R8)(DI*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(DI*1)
|
|
JMP memmove_mid_end_copy_match_emit_repeat_encodeBetterBlockAsm64K
|
|
|
|
emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm64K_memmove_move_33through64:
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(DI*1), X2
|
|
MOVOU -16(R8)(DI*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(DI*1)
|
|
MOVOU X3, -16(CX)(DI*1)
|
|
|
|
memmove_mid_end_copy_match_emit_repeat_encodeBetterBlockAsm64K:
|
|
MOVQ SI, CX
|
|
JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm64K
|
|
|
|
memmove_long_match_emit_repeat_encodeBetterBlockAsm64K:
|
|
LEAQ (CX)(DI*1), SI
|
|
|
|
// genMemMoveLong
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(DI*1), X2
|
|
MOVOU -16(R8)(DI*1), X3
|
|
MOVQ DI, R10
|
|
SHRQ $0x05, R10
|
|
MOVQ CX, R9
|
|
ANDL $0x0000001f, R9
|
|
MOVQ $0x00000040, R12
|
|
SUBQ R9, R12
|
|
DECQ R10
|
|
JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm64Klarge_forward_sse_loop_32
|
|
LEAQ -32(R8)(R12*1), R9
|
|
LEAQ -32(CX)(R12*1), R13
|
|
|
|
emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm64Klarge_big_loop_back:
|
|
MOVOU (R9), X4
|
|
MOVOU 16(R9), X5
|
|
MOVOA X4, (R13)
|
|
MOVOA X5, 16(R13)
|
|
ADDQ $0x20, R13
|
|
ADDQ $0x20, R9
|
|
ADDQ $0x20, R12
|
|
DECQ R10
|
|
JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm64Klarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm64Klarge_forward_sse_loop_32:
|
|
MOVOU -32(R8)(R12*1), X4
|
|
MOVOU -16(R8)(R12*1), X5
|
|
MOVOA X4, -32(CX)(R12*1)
|
|
MOVOA X5, -16(CX)(R12*1)
|
|
ADDQ $0x20, R12
|
|
CMPQ DI, R12
|
|
JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm64Klarge_forward_sse_loop_32
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(DI*1)
|
|
MOVOU X3, -16(CX)(DI*1)
|
|
MOVQ SI, CX
|
|
|
|
emit_literal_done_match_emit_repeat_encodeBetterBlockAsm64K:
|
|
ADDL R11, AX
|
|
ADDL $0x04, R11
|
|
MOVL AX, 12(SP)
|
|
|
|
// emitRepeat
|
|
LEAL -1(R11), SI
|
|
CMPL R11, $0x1d
|
|
JBE repeat_one_match_nolit_repeat_encodeBetterBlockAsm64K
|
|
LEAL -30(R11), SI
|
|
CMPL R11, $0x0000011e
|
|
JB repeat_two_match_nolit_repeat_encodeBetterBlockAsm64K
|
|
CMPL R11, $0x0001001e
|
|
JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm64K
|
|
MOVB $0xfc, (CX)
|
|
MOVL SI, 1(CX)
|
|
ADDQ $0x04, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm64K
|
|
|
|
repeat_three_match_nolit_repeat_encodeBetterBlockAsm64K:
|
|
MOVB $0xf4, (CX)
|
|
MOVW SI, 1(CX)
|
|
ADDQ $0x03, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm64K
|
|
|
|
repeat_two_match_nolit_repeat_encodeBetterBlockAsm64K:
|
|
MOVB $0xec, (CX)
|
|
MOVB SI, 1(CX)
|
|
ADDQ $0x02, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm64K
|
|
|
|
repeat_one_match_nolit_repeat_encodeBetterBlockAsm64K:
|
|
XORL SI, SI
|
|
LEAL -4(SI)(R11*8), SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x01, CX
|
|
|
|
match_nolit_emitcopy_end_encodeBetterBlockAsm64K:
|
|
CMPL AX, 8(SP)
|
|
JAE emit_remainder_encodeBetterBlockAsm64K
|
|
CMPQ CX, (SP)
|
|
JB match_nolit_dst_ok_encodeBetterBlockAsm64K
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
match_nolit_dst_ok_encodeBetterBlockAsm64K:
|
|
MOVQ tmp+48(FP), SI
|
|
MOVQ $0x0000cf1bbcdcbf9b, DI
|
|
MOVQ $0x9e3779b1, R8
|
|
LEAQ 1(BX), BX
|
|
LEAQ -2(AX), R9
|
|
MOVQ (DX)(BX*1), R10
|
|
MOVQ 1(DX)(BX*1), R11
|
|
MOVQ (DX)(R9*1), R12
|
|
MOVQ 1(DX)(R9*1), R13
|
|
SHLQ $0x10, R10
|
|
IMULQ DI, R10
|
|
SHRQ $0x31, R10
|
|
SHLQ $0x20, R11
|
|
IMULQ R8, R11
|
|
SHRQ $0x34, R11
|
|
SHLQ $0x10, R12
|
|
IMULQ DI, R12
|
|
SHRQ $0x31, R12
|
|
SHLQ $0x20, R13
|
|
IMULQ R8, R13
|
|
SHRQ $0x34, R13
|
|
LEAQ 1(BX), R8
|
|
LEAQ 1(R9), R14
|
|
MOVW BX, (SI)(R10*2)
|
|
MOVW R9, (SI)(R12*2)
|
|
LEAQ 1(R9)(BX*1), R10
|
|
SHRQ $0x01, R10
|
|
ADDQ $0x01, BX
|
|
SUBQ $0x01, R9
|
|
MOVW R8, 65536(SI)(R11*2)
|
|
MOVW R14, 65536(SI)(R13*2)
|
|
|
|
index_loop_encodeBetterBlockAsm64K:
|
|
CMPQ R10, R9
|
|
JAE search_loop_encodeBetterBlockAsm64K
|
|
MOVQ (DX)(BX*1), R8
|
|
MOVQ (DX)(R10*1), R11
|
|
SHLQ $0x10, R8
|
|
IMULQ DI, R8
|
|
SHRQ $0x31, R8
|
|
SHLQ $0x10, R11
|
|
IMULQ DI, R11
|
|
SHRQ $0x31, R11
|
|
MOVW BX, (SI)(R8*2)
|
|
MOVW R9, (SI)(R11*2)
|
|
ADDQ $0x02, BX
|
|
ADDQ $0x02, R10
|
|
JMP index_loop_encodeBetterBlockAsm64K
|
|
|
|
emit_remainder_encodeBetterBlockAsm64K:
|
|
MOVQ src_len+32(FP), AX
|
|
SUBL 12(SP), AX
|
|
LEAQ 4(CX)(AX*1), AX
|
|
CMPQ AX, (SP)
|
|
JB emit_remainder_ok_encodeBetterBlockAsm64K
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
emit_remainder_ok_encodeBetterBlockAsm64K:
|
|
MOVQ src_len+32(FP), AX
|
|
|
|
// emitLiteralsDstP
|
|
MOVL 12(SP), BX
|
|
CMPL BX, AX
|
|
JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm64K
|
|
MOVL AX, SI
|
|
MOVL AX, 12(SP)
|
|
LEAQ (DX)(BX*1), AX
|
|
SUBL BX, SI
|
|
|
|
// emitLiteral
|
|
LEAL -1(SI), DX
|
|
CMPL DX, $0x1d
|
|
JB one_byte_emit_remainder_encodeBetterBlockAsm64K
|
|
SUBL $0x1d, DX
|
|
CMPL DX, $0x00000100
|
|
JB two_bytes_emit_remainder_encodeBetterBlockAsm64K
|
|
JB three_bytes_emit_remainder_encodeBetterBlockAsm64K
|
|
MOVL DX, BX
|
|
SHRL $0x10, BX
|
|
MOVB $0xf8, (CX)
|
|
MOVW DX, 1(CX)
|
|
MOVB BL, 3(CX)
|
|
ADDQ $0x04, CX
|
|
ADDL $0x1d, DX
|
|
JMP memmove_long_emit_remainder_encodeBetterBlockAsm64K
|
|
|
|
three_bytes_emit_remainder_encodeBetterBlockAsm64K:
|
|
MOVB $0xf0, (CX)
|
|
MOVW DX, 1(CX)
|
|
ADDQ $0x03, CX
|
|
ADDL $0x1d, DX
|
|
JMP memmove_long_emit_remainder_encodeBetterBlockAsm64K
|
|
|
|
two_bytes_emit_remainder_encodeBetterBlockAsm64K:
|
|
MOVB $0xe8, (CX)
|
|
MOVB DL, 1(CX)
|
|
ADDL $0x1d, DX
|
|
ADDQ $0x02, CX
|
|
CMPL DX, $0x40
|
|
JB memmove_midemit_remainder_encodeBetterBlockAsm64K
|
|
JMP memmove_long_emit_remainder_encodeBetterBlockAsm64K
|
|
|
|
one_byte_emit_remainder_encodeBetterBlockAsm64K:
|
|
SHLB $0x03, DL
|
|
MOVB DL, (CX)
|
|
ADDQ $0x01, CX
|
|
LEAQ (CX)(SI*1), DX
|
|
MOVL SI, BX
|
|
|
|
// genMemMoveShort
|
|
// margin: -1, min move: 1
|
|
CMPQ BX, $0x03
|
|
JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm64K_memmove_move_1or2
|
|
JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm64K_memmove_move_3
|
|
CMPQ BX, $0x08
|
|
JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm64K_memmove_move_4through8
|
|
CMPQ BX, $0x10
|
|
JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm64K_memmove_move_8through16
|
|
CMPQ BX, $0x20
|
|
JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm64K_memmove_move_17through32
|
|
JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm64K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm64K_memmove_move_1or2:
|
|
MOVB (AX), SI
|
|
MOVB -1(AX)(BX*1), AL
|
|
MOVB SI, (CX)
|
|
MOVB AL, -1(CX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm64K
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm64K_memmove_move_3:
|
|
MOVW (AX), SI
|
|
MOVB 2(AX), AL
|
|
MOVW SI, (CX)
|
|
MOVB AL, 2(CX)
|
|
JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm64K
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm64K_memmove_move_4through8:
|
|
MOVL (AX), SI
|
|
MOVL -4(AX)(BX*1), AX
|
|
MOVL SI, (CX)
|
|
MOVL AX, -4(CX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm64K
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm64K_memmove_move_8through16:
|
|
MOVQ (AX), SI
|
|
MOVQ -8(AX)(BX*1), AX
|
|
MOVQ SI, (CX)
|
|
MOVQ AX, -8(CX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm64K
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm64K_memmove_move_17through32:
|
|
MOVOU (AX), X0
|
|
MOVOU -16(AX)(BX*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm64K
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm64K_memmove_move_33through64:
|
|
MOVOU (AX), X0
|
|
MOVOU 16(AX), X1
|
|
MOVOU -32(AX)(BX*1), X2
|
|
MOVOU -16(AX)(BX*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(BX*1)
|
|
MOVOU X3, -16(CX)(BX*1)
|
|
|
|
memmove_end_copy_emit_remainder_encodeBetterBlockAsm64K:
|
|
MOVQ DX, CX
|
|
JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm64K
|
|
|
|
memmove_midemit_remainder_encodeBetterBlockAsm64K:
|
|
LEAQ (CX)(SI*1), DX
|
|
MOVL SI, BX
|
|
|
|
// genMemMoveShort
|
|
// margin: -2, min move: 30
|
|
CMPQ BX, $0x20
|
|
JBE emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm64K_memmove_move_17through32
|
|
JMP emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm64K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm64K_memmove_move_17through32:
|
|
MOVOU (AX), X0
|
|
MOVOU -16(AX)(BX*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(BX*1)
|
|
JMP memmove_mid_end_copy_emit_remainder_encodeBetterBlockAsm64K
|
|
|
|
emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm64K_memmove_move_33through64:
|
|
MOVOU (AX), X0
|
|
MOVOU 16(AX), X1
|
|
MOVOU -32(AX)(BX*1), X2
|
|
MOVOU -16(AX)(BX*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(BX*1)
|
|
MOVOU X3, -16(CX)(BX*1)
|
|
|
|
memmove_mid_end_copy_emit_remainder_encodeBetterBlockAsm64K:
|
|
MOVQ DX, CX
|
|
JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm64K
|
|
|
|
memmove_long_emit_remainder_encodeBetterBlockAsm64K:
|
|
LEAQ (CX)(SI*1), DX
|
|
MOVL SI, BX
|
|
|
|
// genMemMoveLong
|
|
MOVOU (AX), X0
|
|
MOVOU 16(AX), X1
|
|
MOVOU -32(AX)(BX*1), X2
|
|
MOVOU -16(AX)(BX*1), X3
|
|
MOVQ BX, DI
|
|
SHRQ $0x05, DI
|
|
MOVQ CX, SI
|
|
ANDL $0x0000001f, SI
|
|
MOVQ $0x00000040, R8
|
|
SUBQ SI, R8
|
|
DECQ DI
|
|
JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm64Klarge_forward_sse_loop_32
|
|
LEAQ -32(AX)(R8*1), SI
|
|
LEAQ -32(CX)(R8*1), R9
|
|
|
|
emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm64Klarge_big_loop_back:
|
|
MOVOU (SI), X4
|
|
MOVOU 16(SI), X5
|
|
MOVOA X4, (R9)
|
|
MOVOA X5, 16(R9)
|
|
ADDQ $0x20, R9
|
|
ADDQ $0x20, SI
|
|
ADDQ $0x20, R8
|
|
DECQ DI
|
|
JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm64Klarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm64Klarge_forward_sse_loop_32:
|
|
MOVOU -32(AX)(R8*1), X4
|
|
MOVOU -16(AX)(R8*1), X5
|
|
MOVOA X4, -32(CX)(R8*1)
|
|
MOVOA X5, -16(CX)(R8*1)
|
|
ADDQ $0x20, R8
|
|
CMPQ BX, R8
|
|
JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm64Klarge_forward_sse_loop_32
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(BX*1)
|
|
MOVOU X3, -16(CX)(BX*1)
|
|
MOVQ DX, CX
|
|
|
|
emit_literal_done_emit_remainder_encodeBetterBlockAsm64K:
|
|
MOVQ dst_base+0(FP), AX
|
|
SUBQ AX, CX
|
|
MOVQ CX, ret+56(FP)
|
|
RET
|
|
|
|
// func encodeBetterBlockAsm16K(dst []byte, src []byte, tmp *[36864]byte) int
|
|
// Requires: BMI, CMOV, SSE2
|
|
TEXT ·encodeBetterBlockAsm16K(SB), $24-64
|
|
MOVQ tmp+48(FP), AX
|
|
MOVQ dst_base+0(FP), CX
|
|
MOVQ $0x00000120, DX
|
|
PXOR X0, X0
|
|
|
|
zero_loop_encodeBetterBlockAsm16K:
|
|
MOVOU X0, (AX)
|
|
MOVOU X0, 16(AX)
|
|
MOVOU X0, 32(AX)
|
|
MOVOU X0, 48(AX)
|
|
MOVOU X0, 64(AX)
|
|
MOVOU X0, 80(AX)
|
|
MOVOU X0, 96(AX)
|
|
MOVOU X0, 112(AX)
|
|
ADDQ $0x80, AX
|
|
DECQ DX
|
|
JNZ zero_loop_encodeBetterBlockAsm16K
|
|
MOVL $0x00000000, 12(SP)
|
|
MOVQ src_len+32(FP), AX
|
|
LEAQ -11(AX), DX
|
|
LEAQ -8(AX), BX
|
|
MOVL BX, 8(SP)
|
|
SHRQ $0x05, AX
|
|
SUBL AX, DX
|
|
LEAQ (CX)(DX*1), DX
|
|
MOVQ DX, (SP)
|
|
MOVL $0x00000001, AX
|
|
MOVL AX, 16(SP)
|
|
MOVQ src_base+24(FP), DX
|
|
|
|
search_loop_encodeBetterBlockAsm16K:
|
|
MOVQ tmp+48(FP), BX
|
|
MOVL AX, SI
|
|
SUBL 12(SP), SI
|
|
SHRL $0x06, SI
|
|
LEAL 1(AX)(SI*1), SI
|
|
CMPL SI, 8(SP)
|
|
JAE emit_remainder_encodeBetterBlockAsm16K
|
|
MOVQ (DX)(AX*1), DI
|
|
MOVL SI, 20(SP)
|
|
MOVQ $0x0000cf1bbcdcbf9b, R9
|
|
MOVQ $0x9e3779b1, SI
|
|
MOVQ DI, R10
|
|
MOVQ DI, R11
|
|
SHLQ $0x10, R10
|
|
IMULQ R9, R10
|
|
SHRQ $0x32, R10
|
|
SHLQ $0x20, R11
|
|
IMULQ SI, R11
|
|
SHRQ $0x35, R11
|
|
MOVWLZX (BX)(R10*2), SI
|
|
MOVWLZX 32768(BX)(R11*2), R8
|
|
MOVW AX, (BX)(R10*2)
|
|
MOVW AX, 32768(BX)(R11*2)
|
|
MOVQ (DX)(SI*1), R10
|
|
CMPQ R10, DI
|
|
JEQ candidate_match_encodeBetterBlockAsm16K
|
|
MOVQ (DX)(R8*1), R11
|
|
CMPQ R11, DI
|
|
MOVL AX, R12
|
|
SUBL 16(SP), R12
|
|
MOVQ (DX)(R12*1), R12
|
|
MOVQ $0x000000ffffffff00, R13
|
|
XORQ DI, R12
|
|
TESTQ R13, R12
|
|
JNE no_repeat_found_encodeBetterBlockAsm16K
|
|
LEAL 1(AX), BX
|
|
MOVL 12(SP), SI
|
|
MOVL BX, DI
|
|
SUBL 16(SP), DI
|
|
JZ repeat_extend_back_end_encodeBetterBlockAsm16K
|
|
|
|
repeat_extend_back_loop_encodeBetterBlockAsm16K:
|
|
CMPL BX, SI
|
|
JBE repeat_extend_back_end_encodeBetterBlockAsm16K
|
|
MOVB -1(DX)(DI*1), R8
|
|
MOVB -1(DX)(BX*1), R9
|
|
CMPB R8, R9
|
|
JNE repeat_extend_back_end_encodeBetterBlockAsm16K
|
|
LEAL -1(BX), BX
|
|
DECL DI
|
|
JNZ repeat_extend_back_loop_encodeBetterBlockAsm16K
|
|
|
|
repeat_extend_back_end_encodeBetterBlockAsm16K:
|
|
MOVL BX, SI
|
|
SUBL 12(SP), SI
|
|
LEAQ 3(CX)(SI*1), SI
|
|
CMPQ SI, (SP)
|
|
JB repeat_dst_size_check_encodeBetterBlockAsm16K
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
repeat_dst_size_check_encodeBetterBlockAsm16K:
|
|
// emitLiteralsDstP
|
|
MOVL 12(SP), SI
|
|
CMPL SI, BX
|
|
JEQ emit_literal_done_repeat_emit_encodeBetterBlockAsm16K
|
|
MOVL BX, DI
|
|
MOVL BX, 12(SP)
|
|
LEAQ (DX)(SI*1), R8
|
|
SUBL SI, DI
|
|
|
|
// emitLiteral
|
|
LEAL -1(DI), SI
|
|
CMPL SI, $0x1d
|
|
JB one_byte_repeat_emit_encodeBetterBlockAsm16K
|
|
SUBL $0x1d, SI
|
|
CMPL SI, $0x00000100
|
|
JB two_bytes_repeat_emit_encodeBetterBlockAsm16K
|
|
JB three_bytes_repeat_emit_encodeBetterBlockAsm16K
|
|
|
|
three_bytes_repeat_emit_encodeBetterBlockAsm16K:
|
|
MOVB $0xf0, (CX)
|
|
MOVW SI, 1(CX)
|
|
ADDQ $0x03, CX
|
|
ADDL $0x1d, SI
|
|
JMP memmove_long_repeat_emit_encodeBetterBlockAsm16K
|
|
|
|
two_bytes_repeat_emit_encodeBetterBlockAsm16K:
|
|
MOVB $0xe8, (CX)
|
|
MOVB SI, 1(CX)
|
|
ADDL $0x1d, SI
|
|
ADDQ $0x02, CX
|
|
CMPL SI, $0x40
|
|
JB memmove_midrepeat_emit_encodeBetterBlockAsm16K
|
|
JMP memmove_long_repeat_emit_encodeBetterBlockAsm16K
|
|
|
|
one_byte_repeat_emit_encodeBetterBlockAsm16K:
|
|
SHLB $0x03, SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x01, CX
|
|
LEAQ (CX)(DI*1), SI
|
|
|
|
// genMemMoveShort
|
|
// margin: 8, min move: 1
|
|
CMPQ DI, $0x08
|
|
JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm16K_memmove_move_8
|
|
CMPQ DI, $0x10
|
|
JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm16K_memmove_move_8through16
|
|
CMPQ DI, $0x20
|
|
JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm16K_memmove_move_17through32
|
|
JMP emit_lit_memmove_repeat_emit_encodeBetterBlockAsm16K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_repeat_emit_encodeBetterBlockAsm16K_memmove_move_8:
|
|
MOVQ (R8), R9
|
|
MOVQ R9, (CX)
|
|
JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm16K
|
|
|
|
emit_lit_memmove_repeat_emit_encodeBetterBlockAsm16K_memmove_move_8through16:
|
|
MOVQ (R8), R9
|
|
MOVQ -8(R8)(DI*1), R8
|
|
MOVQ R9, (CX)
|
|
MOVQ R8, -8(CX)(DI*1)
|
|
JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm16K
|
|
|
|
emit_lit_memmove_repeat_emit_encodeBetterBlockAsm16K_memmove_move_17through32:
|
|
MOVOU (R8), X0
|
|
MOVOU -16(R8)(DI*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(DI*1)
|
|
JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm16K
|
|
|
|
emit_lit_memmove_repeat_emit_encodeBetterBlockAsm16K_memmove_move_33through64:
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(DI*1), X2
|
|
MOVOU -16(R8)(DI*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(DI*1)
|
|
MOVOU X3, -16(CX)(DI*1)
|
|
|
|
memmove_end_copy_repeat_emit_encodeBetterBlockAsm16K:
|
|
MOVQ SI, CX
|
|
JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm16K
|
|
|
|
memmove_midrepeat_emit_encodeBetterBlockAsm16K:
|
|
LEAQ (CX)(DI*1), SI
|
|
|
|
// genMemMoveShort
|
|
// margin: 8, min move: 30
|
|
CMPQ DI, $0x20
|
|
JBE emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm16K_memmove_move_17through32
|
|
JMP emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm16K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm16K_memmove_move_17through32:
|
|
MOVOU (R8), X0
|
|
MOVOU -16(R8)(DI*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(DI*1)
|
|
JMP memmove_mid_end_copy_repeat_emit_encodeBetterBlockAsm16K
|
|
|
|
emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm16K_memmove_move_33through64:
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(DI*1), X2
|
|
MOVOU -16(R8)(DI*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(DI*1)
|
|
MOVOU X3, -16(CX)(DI*1)
|
|
|
|
memmove_mid_end_copy_repeat_emit_encodeBetterBlockAsm16K:
|
|
MOVQ SI, CX
|
|
JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm16K
|
|
|
|
memmove_long_repeat_emit_encodeBetterBlockAsm16K:
|
|
LEAQ (CX)(DI*1), SI
|
|
|
|
// genMemMoveLong
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(DI*1), X2
|
|
MOVOU -16(R8)(DI*1), X3
|
|
MOVQ DI, R10
|
|
SHRQ $0x05, R10
|
|
MOVQ CX, R9
|
|
ANDL $0x0000001f, R9
|
|
MOVQ $0x00000040, R11
|
|
SUBQ R9, R11
|
|
DECQ R10
|
|
JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm16Klarge_forward_sse_loop_32
|
|
LEAQ -32(R8)(R11*1), R9
|
|
LEAQ -32(CX)(R11*1), R12
|
|
|
|
emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm16Klarge_big_loop_back:
|
|
MOVOU (R9), X4
|
|
MOVOU 16(R9), X5
|
|
MOVOA X4, (R12)
|
|
MOVOA X5, 16(R12)
|
|
ADDQ $0x20, R12
|
|
ADDQ $0x20, R9
|
|
ADDQ $0x20, R11
|
|
DECQ R10
|
|
JNA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm16Klarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm16Klarge_forward_sse_loop_32:
|
|
MOVOU -32(R8)(R11*1), X4
|
|
MOVOU -16(R8)(R11*1), X5
|
|
MOVOA X4, -32(CX)(R11*1)
|
|
MOVOA X5, -16(CX)(R11*1)
|
|
ADDQ $0x20, R11
|
|
CMPQ DI, R11
|
|
JAE emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm16Klarge_forward_sse_loop_32
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(DI*1)
|
|
MOVOU X3, -16(CX)(DI*1)
|
|
MOVQ SI, CX
|
|
|
|
emit_literal_done_repeat_emit_encodeBetterBlockAsm16K:
|
|
ADDL $0x05, AX
|
|
MOVL AX, SI
|
|
SUBL 16(SP), SI
|
|
MOVQ src_len+32(FP), DI
|
|
SUBL AX, DI
|
|
LEAQ (DX)(AX*1), R8
|
|
LEAQ (DX)(SI*1), SI
|
|
|
|
// matchLen
|
|
XORL R10, R10
|
|
JMP matchlen_loop_16_entry_repeat_extend_encodeBetterBlockAsm16K
|
|
|
|
matchlen_loopback_16_repeat_extend_encodeBetterBlockAsm16K:
|
|
MOVQ (R8)(R10*1), R9
|
|
MOVQ 8(R8)(R10*1), R11
|
|
XORQ (SI)(R10*1), R9
|
|
JNZ matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm16K
|
|
XORQ 8(SI)(R10*1), R11
|
|
JNZ matchlen_bsf_16repeat_extend_encodeBetterBlockAsm16K
|
|
LEAL -16(DI), DI
|
|
LEAL 16(R10), R10
|
|
|
|
matchlen_loop_16_entry_repeat_extend_encodeBetterBlockAsm16K:
|
|
CMPL DI, $0x10
|
|
JAE matchlen_loopback_16_repeat_extend_encodeBetterBlockAsm16K
|
|
JMP matchlen_match8_repeat_extend_encodeBetterBlockAsm16K
|
|
|
|
matchlen_bsf_16repeat_extend_encodeBetterBlockAsm16K:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R11, R11
|
|
|
|
#else
|
|
BSFQ R11, R11
|
|
|
|
#endif
|
|
SARQ $0x03, R11
|
|
LEAL 8(R10)(R11*1), R10
|
|
JMP repeat_extend_forward_end_encodeBetterBlockAsm16K
|
|
|
|
matchlen_match8_repeat_extend_encodeBetterBlockAsm16K:
|
|
CMPL DI, $0x08
|
|
JB matchlen_match4_repeat_extend_encodeBetterBlockAsm16K
|
|
MOVQ (R8)(R10*1), R9
|
|
XORQ (SI)(R10*1), R9
|
|
JNZ matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm16K
|
|
LEAL -8(DI), DI
|
|
LEAL 8(R10), R10
|
|
JMP matchlen_match4_repeat_extend_encodeBetterBlockAsm16K
|
|
|
|
matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm16K:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R9, R9
|
|
|
|
#else
|
|
BSFQ R9, R9
|
|
|
|
#endif
|
|
SARQ $0x03, R9
|
|
LEAL (R10)(R9*1), R10
|
|
JMP repeat_extend_forward_end_encodeBetterBlockAsm16K
|
|
|
|
matchlen_match4_repeat_extend_encodeBetterBlockAsm16K:
|
|
CMPL DI, $0x04
|
|
JB matchlen_match2_repeat_extend_encodeBetterBlockAsm16K
|
|
MOVL (R8)(R10*1), R9
|
|
CMPL (SI)(R10*1), R9
|
|
JNE matchlen_match2_repeat_extend_encodeBetterBlockAsm16K
|
|
LEAL -4(DI), DI
|
|
LEAL 4(R10), R10
|
|
|
|
matchlen_match2_repeat_extend_encodeBetterBlockAsm16K:
|
|
CMPL DI, $0x01
|
|
JE matchlen_match1_repeat_extend_encodeBetterBlockAsm16K
|
|
JB repeat_extend_forward_end_encodeBetterBlockAsm16K
|
|
MOVW (R8)(R10*1), R9
|
|
CMPW (SI)(R10*1), R9
|
|
JNE matchlen_match1_repeat_extend_encodeBetterBlockAsm16K
|
|
LEAL 2(R10), R10
|
|
SUBL $0x02, DI
|
|
JZ repeat_extend_forward_end_encodeBetterBlockAsm16K
|
|
|
|
matchlen_match1_repeat_extend_encodeBetterBlockAsm16K:
|
|
MOVB (R8)(R10*1), R9
|
|
CMPB (SI)(R10*1), R9
|
|
JNE repeat_extend_forward_end_encodeBetterBlockAsm16K
|
|
LEAL 1(R10), R10
|
|
|
|
repeat_extend_forward_end_encodeBetterBlockAsm16K:
|
|
ADDL R10, AX
|
|
MOVL AX, SI
|
|
SUBL BX, SI
|
|
MOVL 16(SP), BX
|
|
|
|
// emitRepeat
|
|
LEAL -1(SI), BX
|
|
CMPL SI, $0x1d
|
|
JBE repeat_one_match_repeat_encodeBetterBlockAsm16K
|
|
LEAL -30(SI), BX
|
|
CMPL SI, $0x0000011e
|
|
JB repeat_two_match_repeat_encodeBetterBlockAsm16K
|
|
CMPL SI, $0x0001001e
|
|
JB repeat_three_match_repeat_encodeBetterBlockAsm16K
|
|
MOVB $0xfc, (CX)
|
|
MOVL BX, 1(CX)
|
|
ADDQ $0x04, CX
|
|
JMP repeat_end_emit_encodeBetterBlockAsm16K
|
|
|
|
repeat_three_match_repeat_encodeBetterBlockAsm16K:
|
|
MOVB $0xf4, (CX)
|
|
MOVW BX, 1(CX)
|
|
ADDQ $0x03, CX
|
|
JMP repeat_end_emit_encodeBetterBlockAsm16K
|
|
|
|
repeat_two_match_repeat_encodeBetterBlockAsm16K:
|
|
MOVB $0xec, (CX)
|
|
MOVB BL, 1(CX)
|
|
ADDQ $0x02, CX
|
|
JMP repeat_end_emit_encodeBetterBlockAsm16K
|
|
|
|
repeat_one_match_repeat_encodeBetterBlockAsm16K:
|
|
XORL BX, BX
|
|
LEAL -4(BX)(SI*8), BX
|
|
MOVB BL, (CX)
|
|
ADDQ $0x01, CX
|
|
|
|
repeat_end_emit_encodeBetterBlockAsm16K:
|
|
MOVL AX, 12(SP)
|
|
JMP search_loop_encodeBetterBlockAsm16K
|
|
|
|
no_repeat_found_encodeBetterBlockAsm16K:
|
|
CMPL R10, DI
|
|
JEQ candidate_match_encodeBetterBlockAsm16K
|
|
CMPL R11, DI
|
|
JEQ candidateS_match_encodeBetterBlockAsm16K
|
|
MOVL 20(SP), AX
|
|
JMP search_loop_encodeBetterBlockAsm16K
|
|
|
|
candidateS_match_encodeBetterBlockAsm16K:
|
|
SHRQ $0x08, DI
|
|
MOVQ DI, R10
|
|
SHLQ $0x10, R10
|
|
IMULQ R9, R10
|
|
SHRQ $0x32, R10
|
|
MOVWLZX (BX)(R10*2), SI
|
|
INCL AX
|
|
MOVW AX, (BX)(R10*2)
|
|
CMPL (DX)(SI*1), DI
|
|
JEQ candidate_match_encodeBetterBlockAsm16K
|
|
DECL AX
|
|
MOVL R8, SI
|
|
|
|
candidate_match_encodeBetterBlockAsm16K:
|
|
MOVL 12(SP), BX
|
|
TESTL SI, SI
|
|
JZ match_extend_back_end_encodeBetterBlockAsm16K
|
|
|
|
match_extend_back_loop_encodeBetterBlockAsm16K:
|
|
CMPL AX, BX
|
|
JBE match_extend_back_end_encodeBetterBlockAsm16K
|
|
MOVB -1(DX)(SI*1), DI
|
|
MOVB -1(DX)(AX*1), R8
|
|
CMPB DI, R8
|
|
JNE match_extend_back_end_encodeBetterBlockAsm16K
|
|
LEAL -1(AX), AX
|
|
DECL SI
|
|
JZ match_extend_back_end_encodeBetterBlockAsm16K
|
|
JMP match_extend_back_loop_encodeBetterBlockAsm16K
|
|
|
|
match_extend_back_end_encodeBetterBlockAsm16K:
|
|
MOVL AX, BX
|
|
SUBL 12(SP), BX
|
|
LEAQ 3(CX)(BX*1), BX
|
|
CMPQ BX, (SP)
|
|
JB match_dst_size_check_encodeBetterBlockAsm16K
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
match_dst_size_check_encodeBetterBlockAsm16K:
|
|
MOVL AX, BX
|
|
ADDL $0x04, AX
|
|
ADDL $0x04, SI
|
|
MOVQ src_len+32(FP), DI
|
|
SUBL AX, DI
|
|
LEAQ (DX)(AX*1), R8
|
|
LEAQ (DX)(SI*1), R9
|
|
|
|
// matchLen
|
|
XORL R11, R11
|
|
JMP matchlen_loop_16_entry_match_nolit_encodeBetterBlockAsm16K
|
|
|
|
matchlen_loopback_16_match_nolit_encodeBetterBlockAsm16K:
|
|
MOVQ (R8)(R11*1), R10
|
|
MOVQ 8(R8)(R11*1), R12
|
|
XORQ (R9)(R11*1), R10
|
|
JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm16K
|
|
XORQ 8(R9)(R11*1), R12
|
|
JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm16K
|
|
LEAL -16(DI), DI
|
|
LEAL 16(R11), R11
|
|
|
|
matchlen_loop_16_entry_match_nolit_encodeBetterBlockAsm16K:
|
|
CMPL DI, $0x10
|
|
JAE matchlen_loopback_16_match_nolit_encodeBetterBlockAsm16K
|
|
JMP matchlen_match8_match_nolit_encodeBetterBlockAsm16K
|
|
|
|
matchlen_bsf_16match_nolit_encodeBetterBlockAsm16K:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R12, R12
|
|
|
|
#else
|
|
BSFQ R12, R12
|
|
|
|
#endif
|
|
SARQ $0x03, R12
|
|
LEAL 8(R11)(R12*1), R11
|
|
JMP match_nolit_end_encodeBetterBlockAsm16K
|
|
|
|
matchlen_match8_match_nolit_encodeBetterBlockAsm16K:
|
|
CMPL DI, $0x08
|
|
JB matchlen_match4_match_nolit_encodeBetterBlockAsm16K
|
|
MOVQ (R8)(R11*1), R10
|
|
XORQ (R9)(R11*1), R10
|
|
JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm16K
|
|
LEAL -8(DI), DI
|
|
LEAL 8(R11), R11
|
|
JMP matchlen_match4_match_nolit_encodeBetterBlockAsm16K
|
|
|
|
matchlen_bsf_8_match_nolit_encodeBetterBlockAsm16K:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R10, R10
|
|
|
|
#else
|
|
BSFQ R10, R10
|
|
|
|
#endif
|
|
SARQ $0x03, R10
|
|
LEAL (R11)(R10*1), R11
|
|
JMP match_nolit_end_encodeBetterBlockAsm16K
|
|
|
|
matchlen_match4_match_nolit_encodeBetterBlockAsm16K:
|
|
CMPL DI, $0x04
|
|
JB matchlen_match2_match_nolit_encodeBetterBlockAsm16K
|
|
MOVL (R8)(R11*1), R10
|
|
CMPL (R9)(R11*1), R10
|
|
JNE matchlen_match2_match_nolit_encodeBetterBlockAsm16K
|
|
LEAL -4(DI), DI
|
|
LEAL 4(R11), R11
|
|
|
|
matchlen_match2_match_nolit_encodeBetterBlockAsm16K:
|
|
CMPL DI, $0x01
|
|
JE matchlen_match1_match_nolit_encodeBetterBlockAsm16K
|
|
JB match_nolit_end_encodeBetterBlockAsm16K
|
|
MOVW (R8)(R11*1), R10
|
|
CMPW (R9)(R11*1), R10
|
|
JNE matchlen_match1_match_nolit_encodeBetterBlockAsm16K
|
|
LEAL 2(R11), R11
|
|
SUBL $0x02, DI
|
|
JZ match_nolit_end_encodeBetterBlockAsm16K
|
|
|
|
matchlen_match1_match_nolit_encodeBetterBlockAsm16K:
|
|
MOVB (R8)(R11*1), R10
|
|
CMPB (R9)(R11*1), R10
|
|
JNE match_nolit_end_encodeBetterBlockAsm16K
|
|
LEAL 1(R11), R11
|
|
|
|
match_nolit_end_encodeBetterBlockAsm16K:
|
|
MOVL AX, DI
|
|
SUBL SI, DI
|
|
MOVL DI, 16(SP)
|
|
|
|
// Check if we can combine lit+copy
|
|
MOVLQZX 12(SP), R8
|
|
MOVL BX, SI
|
|
SUBL R8, SI
|
|
JZ match_emit_nolits_encodeBetterBlockAsm16K
|
|
CMPL DI, $0x00000040
|
|
JL match_emit_lits_encodeBetterBlockAsm16K
|
|
CMPL SI, $0x04
|
|
JA match_emit_lits_encodeBetterBlockAsm16K
|
|
MOVL (DX)(R8*1), R8
|
|
ADDL R11, AX
|
|
ADDL $0x04, R11
|
|
MOVL AX, 12(SP)
|
|
|
|
// emitCopy2WithLits
|
|
XORQ R9, R9
|
|
SUBL $0x40, DI
|
|
LEAL -11(R11), R10
|
|
LEAL -4(R11), R11
|
|
MOVW DI, 1(CX)
|
|
CMPL R11, $0x07
|
|
CMOVLGE R10, R9
|
|
MOVQ $0x00000007, DI
|
|
CMOVLLT R11, DI
|
|
LEAL -1(SI)(DI*4), DI
|
|
MOVL $0x00000003, R10
|
|
LEAL (R10)(DI*8), DI
|
|
MOVB DI, (CX)
|
|
ADDQ $0x03, CX
|
|
MOVL R8, (CX)
|
|
ADDQ SI, CX
|
|
TESTL R9, R9
|
|
JZ match_nolit_emitcopy_end_encodeBetterBlockAsm16K
|
|
|
|
// emitRepeat
|
|
LEAL -1(R9), SI
|
|
CMPL R9, $0x1d
|
|
JBE repeat_one_match_emit_repeat_copy2_encodeBetterBlockAsm16K
|
|
LEAL -30(R9), SI
|
|
CMPL R9, $0x0000011e
|
|
JB repeat_two_match_emit_repeat_copy2_encodeBetterBlockAsm16K
|
|
CMPL R9, $0x0001001e
|
|
JB repeat_three_match_emit_repeat_copy2_encodeBetterBlockAsm16K
|
|
MOVB $0xfc, (CX)
|
|
MOVL SI, 1(CX)
|
|
ADDQ $0x04, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm16K
|
|
|
|
repeat_three_match_emit_repeat_copy2_encodeBetterBlockAsm16K:
|
|
MOVB $0xf4, (CX)
|
|
MOVW SI, 1(CX)
|
|
ADDQ $0x03, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm16K
|
|
|
|
repeat_two_match_emit_repeat_copy2_encodeBetterBlockAsm16K:
|
|
MOVB $0xec, (CX)
|
|
MOVB SI, 1(CX)
|
|
ADDQ $0x02, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm16K
|
|
|
|
repeat_one_match_emit_repeat_copy2_encodeBetterBlockAsm16K:
|
|
XORL SI, SI
|
|
LEAL -4(SI)(R9*8), SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x01, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm16K
|
|
|
|
match_emit_lits_encodeBetterBlockAsm16K:
|
|
LEAQ (DX)(R8*1), R8
|
|
|
|
// emitLiteral
|
|
LEAL -1(SI), R9
|
|
CMPL R9, $0x1d
|
|
JB one_byte_match_emit_encodeBetterBlockAsm16K
|
|
SUBL $0x1d, R9
|
|
CMPL R9, $0x00000100
|
|
JB two_bytes_match_emit_encodeBetterBlockAsm16K
|
|
JB three_bytes_match_emit_encodeBetterBlockAsm16K
|
|
|
|
three_bytes_match_emit_encodeBetterBlockAsm16K:
|
|
MOVB $0xf0, (CX)
|
|
MOVW R9, 1(CX)
|
|
ADDQ $0x03, CX
|
|
ADDL $0x1d, R9
|
|
JMP memmove_long_match_emit_encodeBetterBlockAsm16K
|
|
|
|
two_bytes_match_emit_encodeBetterBlockAsm16K:
|
|
MOVB $0xe8, (CX)
|
|
MOVB R9, 1(CX)
|
|
ADDL $0x1d, R9
|
|
ADDQ $0x02, CX
|
|
CMPL R9, $0x40
|
|
JB memmove_midmatch_emit_encodeBetterBlockAsm16K
|
|
JMP memmove_long_match_emit_encodeBetterBlockAsm16K
|
|
|
|
one_byte_match_emit_encodeBetterBlockAsm16K:
|
|
SHLB $0x03, R9
|
|
MOVB R9, (CX)
|
|
ADDQ $0x01, CX
|
|
LEAQ (CX)(SI*1), R9
|
|
|
|
// genMemMoveShort
|
|
// margin: 8, min move: 1
|
|
CMPQ SI, $0x08
|
|
JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm16K_memmove_move_8
|
|
CMPQ SI, $0x10
|
|
JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm16K_memmove_move_8through16
|
|
CMPQ SI, $0x20
|
|
JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm16K_memmove_move_17through32
|
|
JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm16K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_match_emit_encodeBetterBlockAsm16K_memmove_move_8:
|
|
MOVQ (R8), R10
|
|
MOVQ R10, (CX)
|
|
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm16K
|
|
|
|
emit_lit_memmove_match_emit_encodeBetterBlockAsm16K_memmove_move_8through16:
|
|
MOVQ (R8), R10
|
|
MOVQ -8(R8)(SI*1), R8
|
|
MOVQ R10, (CX)
|
|
MOVQ R8, -8(CX)(SI*1)
|
|
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm16K
|
|
|
|
emit_lit_memmove_match_emit_encodeBetterBlockAsm16K_memmove_move_17through32:
|
|
MOVOU (R8), X0
|
|
MOVOU -16(R8)(SI*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(SI*1)
|
|
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm16K
|
|
|
|
emit_lit_memmove_match_emit_encodeBetterBlockAsm16K_memmove_move_33through64:
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(SI*1), X2
|
|
MOVOU -16(R8)(SI*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(SI*1)
|
|
MOVOU X3, -16(CX)(SI*1)
|
|
|
|
memmove_end_copy_match_emit_encodeBetterBlockAsm16K:
|
|
MOVQ R9, CX
|
|
JMP match_emit_nolits_encodeBetterBlockAsm16K
|
|
|
|
memmove_midmatch_emit_encodeBetterBlockAsm16K:
|
|
LEAQ (CX)(SI*1), R9
|
|
|
|
// genMemMoveShort
|
|
// margin: 8, min move: 30
|
|
CMPQ SI, $0x20
|
|
JBE emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm16K_memmove_move_17through32
|
|
JMP emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm16K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm16K_memmove_move_17through32:
|
|
MOVOU (R8), X0
|
|
MOVOU -16(R8)(SI*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(SI*1)
|
|
JMP memmove_mid_end_copy_match_emit_encodeBetterBlockAsm16K
|
|
|
|
emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm16K_memmove_move_33through64:
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(SI*1), X2
|
|
MOVOU -16(R8)(SI*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(SI*1)
|
|
MOVOU X3, -16(CX)(SI*1)
|
|
|
|
memmove_mid_end_copy_match_emit_encodeBetterBlockAsm16K:
|
|
MOVQ R9, CX
|
|
JMP match_emit_nolits_encodeBetterBlockAsm16K
|
|
|
|
memmove_long_match_emit_encodeBetterBlockAsm16K:
|
|
LEAQ (CX)(SI*1), R9
|
|
|
|
// genMemMoveLong
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(SI*1), X2
|
|
MOVOU -16(R8)(SI*1), X3
|
|
MOVQ SI, R12
|
|
SHRQ $0x05, R12
|
|
MOVQ CX, R10
|
|
ANDL $0x0000001f, R10
|
|
MOVQ $0x00000040, R13
|
|
SUBQ R10, R13
|
|
DECQ R12
|
|
JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm16Klarge_forward_sse_loop_32
|
|
LEAQ -32(R8)(R13*1), R10
|
|
LEAQ -32(CX)(R13*1), R14
|
|
|
|
emit_lit_memmove_long_match_emit_encodeBetterBlockAsm16Klarge_big_loop_back:
|
|
MOVOU (R10), X4
|
|
MOVOU 16(R10), X5
|
|
MOVOA X4, (R14)
|
|
MOVOA X5, 16(R14)
|
|
ADDQ $0x20, R14
|
|
ADDQ $0x20, R10
|
|
ADDQ $0x20, R13
|
|
DECQ R12
|
|
JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm16Klarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_match_emit_encodeBetterBlockAsm16Klarge_forward_sse_loop_32:
|
|
MOVOU -32(R8)(R13*1), X4
|
|
MOVOU -16(R8)(R13*1), X5
|
|
MOVOA X4, -32(CX)(R13*1)
|
|
MOVOA X5, -16(CX)(R13*1)
|
|
ADDQ $0x20, R13
|
|
CMPQ SI, R13
|
|
JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm16Klarge_forward_sse_loop_32
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(SI*1)
|
|
MOVOU X3, -16(CX)(SI*1)
|
|
MOVQ R9, CX
|
|
|
|
match_emit_nolits_encodeBetterBlockAsm16K:
|
|
ADDL R11, AX
|
|
ADDL $0x04, R11
|
|
MOVL AX, 12(SP)
|
|
|
|
// emitCopy
|
|
CMPL DI, $0x00000400
|
|
JA two_byte_match_nolit_encodeBetterBlockAsm16K
|
|
CMPL R11, $0x00000013
|
|
JAE emit_one_longer_match_nolit_encodeBetterBlockAsm16K
|
|
LEAL -1(DI), SI
|
|
SHLL $0x06, SI
|
|
LEAL -15(SI)(R11*4), SI
|
|
MOVW SI, (CX)
|
|
ADDQ $0x02, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm16K
|
|
|
|
emit_one_longer_match_nolit_encodeBetterBlockAsm16K:
|
|
CMPL R11, $0x00000112
|
|
JAE emit_copy1_repeat_match_nolit_encodeBetterBlockAsm16K
|
|
LEAL -1(DI), SI
|
|
SHLL $0x06, SI
|
|
LEAL 61(SI), SI
|
|
MOVW SI, (CX)
|
|
LEAL -18(R11), SI
|
|
MOVB SI, 2(CX)
|
|
ADDQ $0x03, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm16K
|
|
|
|
emit_copy1_repeat_match_nolit_encodeBetterBlockAsm16K:
|
|
LEAL -1(DI), SI
|
|
SHLL $0x06, SI
|
|
LEAL 57(SI), SI
|
|
MOVW SI, (CX)
|
|
ADDQ $0x02, CX
|
|
SUBL $0x12, R11
|
|
|
|
// emitRepeat
|
|
LEAL -1(R11), SI
|
|
CMPL R11, $0x1d
|
|
JBE repeat_one_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm16K
|
|
LEAL -30(R11), SI
|
|
CMPL R11, $0x0000011e
|
|
JB repeat_two_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm16K
|
|
CMPL R11, $0x0001001e
|
|
JB repeat_three_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm16K
|
|
MOVB $0xfc, (CX)
|
|
MOVL SI, 1(CX)
|
|
ADDQ $0x04, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm16K
|
|
|
|
repeat_three_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm16K:
|
|
MOVB $0xf4, (CX)
|
|
MOVW SI, 1(CX)
|
|
ADDQ $0x03, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm16K
|
|
|
|
repeat_two_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm16K:
|
|
MOVB $0xec, (CX)
|
|
MOVB SI, 1(CX)
|
|
ADDQ $0x02, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm16K
|
|
|
|
repeat_one_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm16K:
|
|
XORL SI, SI
|
|
LEAL -4(SI)(R11*8), SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x01, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm16K
|
|
|
|
two_byte_match_nolit_encodeBetterBlockAsm16K:
|
|
// emitCopy2
|
|
LEAL -64(DI), DI
|
|
LEAL -4(R11), R11
|
|
MOVW DI, 1(CX)
|
|
CMPL R11, $0x3c
|
|
JBE emit_copy2_0_match_nolit_encodeBetterBlockAsm16K_emit2
|
|
LEAL -60(R11), SI
|
|
CMPL R11, $0x0000013c
|
|
JB emit_copy2_1_match_nolit_encodeBetterBlockAsm16K_emit2
|
|
CMPL R11, $0x0001003c
|
|
JB emit_copy2_2_match_nolit_encodeBetterBlockAsm16K_emit2
|
|
MOVB $0xfe, (CX)
|
|
MOVL SI, 3(CX)
|
|
ADDQ $0x06, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm16K
|
|
|
|
emit_copy2_2_match_nolit_encodeBetterBlockAsm16K_emit2:
|
|
MOVB $0xfa, (CX)
|
|
MOVW SI, 3(CX)
|
|
ADDQ $0x05, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm16K
|
|
|
|
emit_copy2_1_match_nolit_encodeBetterBlockAsm16K_emit2:
|
|
MOVB $0xf6, (CX)
|
|
MOVB SI, 3(CX)
|
|
ADDQ $0x04, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm16K
|
|
|
|
emit_copy2_0_match_nolit_encodeBetterBlockAsm16K_emit2:
|
|
MOVL $0x00000002, SI
|
|
LEAL (SI)(R11*4), SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x03, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm16K
|
|
|
|
// emitLiteralsDstP
|
|
MOVL 12(SP), SI
|
|
CMPL SI, BX
|
|
JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm16K
|
|
MOVL BX, DI
|
|
MOVL BX, 12(SP)
|
|
LEAQ (DX)(SI*1), R8
|
|
SUBL SI, DI
|
|
|
|
// emitLiteral
|
|
LEAL -1(DI), SI
|
|
CMPL SI, $0x1d
|
|
JB one_byte_match_emit_repeat_encodeBetterBlockAsm16K
|
|
SUBL $0x1d, SI
|
|
CMPL SI, $0x00000100
|
|
JB two_bytes_match_emit_repeat_encodeBetterBlockAsm16K
|
|
JB three_bytes_match_emit_repeat_encodeBetterBlockAsm16K
|
|
|
|
three_bytes_match_emit_repeat_encodeBetterBlockAsm16K:
|
|
MOVB $0xf0, (CX)
|
|
MOVW SI, 1(CX)
|
|
ADDQ $0x03, CX
|
|
ADDL $0x1d, SI
|
|
JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm16K
|
|
|
|
two_bytes_match_emit_repeat_encodeBetterBlockAsm16K:
|
|
MOVB $0xe8, (CX)
|
|
MOVB SI, 1(CX)
|
|
ADDL $0x1d, SI
|
|
ADDQ $0x02, CX
|
|
CMPL SI, $0x40
|
|
JB memmove_midmatch_emit_repeat_encodeBetterBlockAsm16K
|
|
JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm16K
|
|
|
|
one_byte_match_emit_repeat_encodeBetterBlockAsm16K:
|
|
SHLB $0x03, SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x01, CX
|
|
LEAQ (CX)(DI*1), SI
|
|
|
|
// genMemMoveShort
|
|
// margin: 8, min move: 1
|
|
CMPQ DI, $0x08
|
|
JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm16K_memmove_move_8
|
|
CMPQ DI, $0x10
|
|
JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm16K_memmove_move_8through16
|
|
CMPQ DI, $0x20
|
|
JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm16K_memmove_move_17through32
|
|
JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm16K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm16K_memmove_move_8:
|
|
MOVQ (R8), R9
|
|
MOVQ R9, (CX)
|
|
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm16K
|
|
|
|
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm16K_memmove_move_8through16:
|
|
MOVQ (R8), R9
|
|
MOVQ -8(R8)(DI*1), R8
|
|
MOVQ R9, (CX)
|
|
MOVQ R8, -8(CX)(DI*1)
|
|
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm16K
|
|
|
|
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm16K_memmove_move_17through32:
|
|
MOVOU (R8), X0
|
|
MOVOU -16(R8)(DI*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(DI*1)
|
|
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm16K
|
|
|
|
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm16K_memmove_move_33through64:
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(DI*1), X2
|
|
MOVOU -16(R8)(DI*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(DI*1)
|
|
MOVOU X3, -16(CX)(DI*1)
|
|
|
|
memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm16K:
|
|
MOVQ SI, CX
|
|
JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm16K
|
|
|
|
memmove_midmatch_emit_repeat_encodeBetterBlockAsm16K:
|
|
LEAQ (CX)(DI*1), SI
|
|
|
|
// genMemMoveShort
|
|
// margin: 8, min move: 30
|
|
CMPQ DI, $0x20
|
|
JBE emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm16K_memmove_move_17through32
|
|
JMP emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm16K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm16K_memmove_move_17through32:
|
|
MOVOU (R8), X0
|
|
MOVOU -16(R8)(DI*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(DI*1)
|
|
JMP memmove_mid_end_copy_match_emit_repeat_encodeBetterBlockAsm16K
|
|
|
|
emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm16K_memmove_move_33through64:
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(DI*1), X2
|
|
MOVOU -16(R8)(DI*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(DI*1)
|
|
MOVOU X3, -16(CX)(DI*1)
|
|
|
|
memmove_mid_end_copy_match_emit_repeat_encodeBetterBlockAsm16K:
|
|
MOVQ SI, CX
|
|
JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm16K
|
|
|
|
memmove_long_match_emit_repeat_encodeBetterBlockAsm16K:
|
|
LEAQ (CX)(DI*1), SI
|
|
|
|
// genMemMoveLong
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(DI*1), X2
|
|
MOVOU -16(R8)(DI*1), X3
|
|
MOVQ DI, R10
|
|
SHRQ $0x05, R10
|
|
MOVQ CX, R9
|
|
ANDL $0x0000001f, R9
|
|
MOVQ $0x00000040, R12
|
|
SUBQ R9, R12
|
|
DECQ R10
|
|
JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm16Klarge_forward_sse_loop_32
|
|
LEAQ -32(R8)(R12*1), R9
|
|
LEAQ -32(CX)(R12*1), R13
|
|
|
|
emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm16Klarge_big_loop_back:
|
|
MOVOU (R9), X4
|
|
MOVOU 16(R9), X5
|
|
MOVOA X4, (R13)
|
|
MOVOA X5, 16(R13)
|
|
ADDQ $0x20, R13
|
|
ADDQ $0x20, R9
|
|
ADDQ $0x20, R12
|
|
DECQ R10
|
|
JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm16Klarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm16Klarge_forward_sse_loop_32:
|
|
MOVOU -32(R8)(R12*1), X4
|
|
MOVOU -16(R8)(R12*1), X5
|
|
MOVOA X4, -32(CX)(R12*1)
|
|
MOVOA X5, -16(CX)(R12*1)
|
|
ADDQ $0x20, R12
|
|
CMPQ DI, R12
|
|
JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm16Klarge_forward_sse_loop_32
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(DI*1)
|
|
MOVOU X3, -16(CX)(DI*1)
|
|
MOVQ SI, CX
|
|
|
|
emit_literal_done_match_emit_repeat_encodeBetterBlockAsm16K:
|
|
ADDL R11, AX
|
|
ADDL $0x04, R11
|
|
MOVL AX, 12(SP)
|
|
|
|
// emitRepeat
|
|
LEAL -1(R11), SI
|
|
CMPL R11, $0x1d
|
|
JBE repeat_one_match_nolit_repeat_encodeBetterBlockAsm16K
|
|
LEAL -30(R11), SI
|
|
CMPL R11, $0x0000011e
|
|
JB repeat_two_match_nolit_repeat_encodeBetterBlockAsm16K
|
|
CMPL R11, $0x0001001e
|
|
JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm16K
|
|
MOVB $0xfc, (CX)
|
|
MOVL SI, 1(CX)
|
|
ADDQ $0x04, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm16K
|
|
|
|
repeat_three_match_nolit_repeat_encodeBetterBlockAsm16K:
|
|
MOVB $0xf4, (CX)
|
|
MOVW SI, 1(CX)
|
|
ADDQ $0x03, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm16K
|
|
|
|
repeat_two_match_nolit_repeat_encodeBetterBlockAsm16K:
|
|
MOVB $0xec, (CX)
|
|
MOVB SI, 1(CX)
|
|
ADDQ $0x02, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm16K
|
|
|
|
repeat_one_match_nolit_repeat_encodeBetterBlockAsm16K:
|
|
XORL SI, SI
|
|
LEAL -4(SI)(R11*8), SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x01, CX
|
|
|
|
match_nolit_emitcopy_end_encodeBetterBlockAsm16K:
|
|
CMPL AX, 8(SP)
|
|
JAE emit_remainder_encodeBetterBlockAsm16K
|
|
CMPQ CX, (SP)
|
|
JB match_nolit_dst_ok_encodeBetterBlockAsm16K
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
match_nolit_dst_ok_encodeBetterBlockAsm16K:
|
|
MOVQ tmp+48(FP), SI
|
|
MOVQ $0x0000cf1bbcdcbf9b, DI
|
|
MOVQ $0x9e3779b1, R8
|
|
LEAQ 1(BX), BX
|
|
LEAQ -2(AX), R9
|
|
MOVQ (DX)(BX*1), R10
|
|
MOVQ 1(DX)(BX*1), R11
|
|
MOVQ (DX)(R9*1), R12
|
|
MOVQ 1(DX)(R9*1), R13
|
|
SHLQ $0x10, R10
|
|
IMULQ DI, R10
|
|
SHRQ $0x32, R10
|
|
SHLQ $0x20, R11
|
|
IMULQ R8, R11
|
|
SHRQ $0x35, R11
|
|
SHLQ $0x10, R12
|
|
IMULQ DI, R12
|
|
SHRQ $0x32, R12
|
|
SHLQ $0x20, R13
|
|
IMULQ R8, R13
|
|
SHRQ $0x35, R13
|
|
LEAQ 1(BX), R8
|
|
LEAQ 1(R9), R14
|
|
MOVW BX, (SI)(R10*2)
|
|
MOVW R9, (SI)(R12*2)
|
|
LEAQ 1(R9)(BX*1), R10
|
|
SHRQ $0x01, R10
|
|
ADDQ $0x01, BX
|
|
SUBQ $0x01, R9
|
|
MOVW R8, 32768(SI)(R11*2)
|
|
MOVW R14, 32768(SI)(R13*2)
|
|
|
|
index_loop_encodeBetterBlockAsm16K:
|
|
CMPQ R10, R9
|
|
JAE search_loop_encodeBetterBlockAsm16K
|
|
MOVQ (DX)(BX*1), R8
|
|
MOVQ (DX)(R10*1), R11
|
|
SHLQ $0x10, R8
|
|
IMULQ DI, R8
|
|
SHRQ $0x32, R8
|
|
SHLQ $0x10, R11
|
|
IMULQ DI, R11
|
|
SHRQ $0x32, R11
|
|
MOVW BX, (SI)(R8*2)
|
|
MOVW R9, (SI)(R11*2)
|
|
ADDQ $0x02, BX
|
|
ADDQ $0x02, R10
|
|
JMP index_loop_encodeBetterBlockAsm16K
|
|
|
|
emit_remainder_encodeBetterBlockAsm16K:
|
|
MOVQ src_len+32(FP), AX
|
|
SUBL 12(SP), AX
|
|
LEAQ 3(CX)(AX*1), AX
|
|
CMPQ AX, (SP)
|
|
JB emit_remainder_ok_encodeBetterBlockAsm16K
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
emit_remainder_ok_encodeBetterBlockAsm16K:
|
|
MOVQ src_len+32(FP), AX
|
|
|
|
// emitLiteralsDstP
|
|
MOVL 12(SP), BX
|
|
CMPL BX, AX
|
|
JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm16K
|
|
MOVL AX, SI
|
|
MOVL AX, 12(SP)
|
|
LEAQ (DX)(BX*1), AX
|
|
SUBL BX, SI
|
|
|
|
// emitLiteral
|
|
LEAL -1(SI), DX
|
|
CMPL DX, $0x1d
|
|
JB one_byte_emit_remainder_encodeBetterBlockAsm16K
|
|
SUBL $0x1d, DX
|
|
CMPL DX, $0x00000100
|
|
JB two_bytes_emit_remainder_encodeBetterBlockAsm16K
|
|
JB three_bytes_emit_remainder_encodeBetterBlockAsm16K
|
|
|
|
three_bytes_emit_remainder_encodeBetterBlockAsm16K:
|
|
MOVB $0xf0, (CX)
|
|
MOVW DX, 1(CX)
|
|
ADDQ $0x03, CX
|
|
ADDL $0x1d, DX
|
|
JMP memmove_long_emit_remainder_encodeBetterBlockAsm16K
|
|
|
|
two_bytes_emit_remainder_encodeBetterBlockAsm16K:
|
|
MOVB $0xe8, (CX)
|
|
MOVB DL, 1(CX)
|
|
ADDL $0x1d, DX
|
|
ADDQ $0x02, CX
|
|
CMPL DX, $0x40
|
|
JB memmove_midemit_remainder_encodeBetterBlockAsm16K
|
|
JMP memmove_long_emit_remainder_encodeBetterBlockAsm16K
|
|
|
|
one_byte_emit_remainder_encodeBetterBlockAsm16K:
|
|
SHLB $0x03, DL
|
|
MOVB DL, (CX)
|
|
ADDQ $0x01, CX
|
|
LEAQ (CX)(SI*1), DX
|
|
MOVL SI, BX
|
|
|
|
// genMemMoveShort
|
|
// margin: -1, min move: 1
|
|
CMPQ BX, $0x03
|
|
JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm16K_memmove_move_1or2
|
|
JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm16K_memmove_move_3
|
|
CMPQ BX, $0x08
|
|
JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm16K_memmove_move_4through8
|
|
CMPQ BX, $0x10
|
|
JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm16K_memmove_move_8through16
|
|
CMPQ BX, $0x20
|
|
JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm16K_memmove_move_17through32
|
|
JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm16K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm16K_memmove_move_1or2:
|
|
MOVB (AX), SI
|
|
MOVB -1(AX)(BX*1), AL
|
|
MOVB SI, (CX)
|
|
MOVB AL, -1(CX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm16K
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm16K_memmove_move_3:
|
|
MOVW (AX), SI
|
|
MOVB 2(AX), AL
|
|
MOVW SI, (CX)
|
|
MOVB AL, 2(CX)
|
|
JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm16K
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm16K_memmove_move_4through8:
|
|
MOVL (AX), SI
|
|
MOVL -4(AX)(BX*1), AX
|
|
MOVL SI, (CX)
|
|
MOVL AX, -4(CX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm16K
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm16K_memmove_move_8through16:
|
|
MOVQ (AX), SI
|
|
MOVQ -8(AX)(BX*1), AX
|
|
MOVQ SI, (CX)
|
|
MOVQ AX, -8(CX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm16K
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm16K_memmove_move_17through32:
|
|
MOVOU (AX), X0
|
|
MOVOU -16(AX)(BX*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm16K
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm16K_memmove_move_33through64:
|
|
MOVOU (AX), X0
|
|
MOVOU 16(AX), X1
|
|
MOVOU -32(AX)(BX*1), X2
|
|
MOVOU -16(AX)(BX*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(BX*1)
|
|
MOVOU X3, -16(CX)(BX*1)
|
|
|
|
memmove_end_copy_emit_remainder_encodeBetterBlockAsm16K:
|
|
MOVQ DX, CX
|
|
JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm16K
|
|
|
|
memmove_midemit_remainder_encodeBetterBlockAsm16K:
|
|
LEAQ (CX)(SI*1), DX
|
|
MOVL SI, BX
|
|
|
|
// genMemMoveShort
|
|
// margin: -2, min move: 30
|
|
CMPQ BX, $0x20
|
|
JBE emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm16K_memmove_move_17through32
|
|
JMP emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm16K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm16K_memmove_move_17through32:
|
|
MOVOU (AX), X0
|
|
MOVOU -16(AX)(BX*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(BX*1)
|
|
JMP memmove_mid_end_copy_emit_remainder_encodeBetterBlockAsm16K
|
|
|
|
emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm16K_memmove_move_33through64:
|
|
MOVOU (AX), X0
|
|
MOVOU 16(AX), X1
|
|
MOVOU -32(AX)(BX*1), X2
|
|
MOVOU -16(AX)(BX*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(BX*1)
|
|
MOVOU X3, -16(CX)(BX*1)
|
|
|
|
memmove_mid_end_copy_emit_remainder_encodeBetterBlockAsm16K:
|
|
MOVQ DX, CX
|
|
JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm16K
|
|
|
|
memmove_long_emit_remainder_encodeBetterBlockAsm16K:
|
|
LEAQ (CX)(SI*1), DX
|
|
MOVL SI, BX
|
|
|
|
// genMemMoveLong
|
|
MOVOU (AX), X0
|
|
MOVOU 16(AX), X1
|
|
MOVOU -32(AX)(BX*1), X2
|
|
MOVOU -16(AX)(BX*1), X3
|
|
MOVQ BX, DI
|
|
SHRQ $0x05, DI
|
|
MOVQ CX, SI
|
|
ANDL $0x0000001f, SI
|
|
MOVQ $0x00000040, R8
|
|
SUBQ SI, R8
|
|
DECQ DI
|
|
JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm16Klarge_forward_sse_loop_32
|
|
LEAQ -32(AX)(R8*1), SI
|
|
LEAQ -32(CX)(R8*1), R9
|
|
|
|
emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm16Klarge_big_loop_back:
|
|
MOVOU (SI), X4
|
|
MOVOU 16(SI), X5
|
|
MOVOA X4, (R9)
|
|
MOVOA X5, 16(R9)
|
|
ADDQ $0x20, R9
|
|
ADDQ $0x20, SI
|
|
ADDQ $0x20, R8
|
|
DECQ DI
|
|
JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm16Klarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm16Klarge_forward_sse_loop_32:
|
|
MOVOU -32(AX)(R8*1), X4
|
|
MOVOU -16(AX)(R8*1), X5
|
|
MOVOA X4, -32(CX)(R8*1)
|
|
MOVOA X5, -16(CX)(R8*1)
|
|
ADDQ $0x20, R8
|
|
CMPQ BX, R8
|
|
JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm16Klarge_forward_sse_loop_32
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(BX*1)
|
|
MOVOU X3, -16(CX)(BX*1)
|
|
MOVQ DX, CX
|
|
|
|
emit_literal_done_emit_remainder_encodeBetterBlockAsm16K:
|
|
MOVQ dst_base+0(FP), AX
|
|
SUBQ AX, CX
|
|
MOVQ CX, ret+56(FP)
|
|
RET
|
|
|
|
// func encodeBetterBlockAsm4K(dst []byte, src []byte, tmp *[10240]byte) int
|
|
// Requires: BMI, CMOV, SSE2
|
|
TEXT ·encodeBetterBlockAsm4K(SB), $24-64
|
|
MOVQ tmp+48(FP), AX
|
|
MOVQ dst_base+0(FP), CX
|
|
MOVQ $0x00000050, DX
|
|
PXOR X0, X0
|
|
|
|
zero_loop_encodeBetterBlockAsm4K:
|
|
MOVOU X0, (AX)
|
|
MOVOU X0, 16(AX)
|
|
MOVOU X0, 32(AX)
|
|
MOVOU X0, 48(AX)
|
|
MOVOU X0, 64(AX)
|
|
MOVOU X0, 80(AX)
|
|
MOVOU X0, 96(AX)
|
|
MOVOU X0, 112(AX)
|
|
ADDQ $0x80, AX
|
|
DECQ DX
|
|
JNZ zero_loop_encodeBetterBlockAsm4K
|
|
MOVL $0x00000000, 12(SP)
|
|
MOVQ src_len+32(FP), AX
|
|
LEAQ -11(AX), DX
|
|
LEAQ -8(AX), BX
|
|
MOVL BX, 8(SP)
|
|
SHRQ $0x05, AX
|
|
SUBL AX, DX
|
|
LEAQ (CX)(DX*1), DX
|
|
MOVQ DX, (SP)
|
|
MOVL $0x00000001, AX
|
|
MOVL AX, 16(SP)
|
|
MOVQ src_base+24(FP), DX
|
|
|
|
search_loop_encodeBetterBlockAsm4K:
|
|
MOVQ tmp+48(FP), BX
|
|
MOVL AX, SI
|
|
SUBL 12(SP), SI
|
|
SHRL $0x05, SI
|
|
LEAL 1(AX)(SI*1), SI
|
|
CMPL SI, 8(SP)
|
|
JAE emit_remainder_encodeBetterBlockAsm4K
|
|
MOVQ (DX)(AX*1), DI
|
|
MOVL SI, 20(SP)
|
|
MOVQ $0x0000cf1bbcdcbf9b, R9
|
|
MOVQ $0x9e3779b1, SI
|
|
MOVQ DI, R10
|
|
MOVQ DI, R11
|
|
SHLQ $0x10, R10
|
|
IMULQ R9, R10
|
|
SHRQ $0x34, R10
|
|
SHLQ $0x20, R11
|
|
IMULQ SI, R11
|
|
SHRQ $0x36, R11
|
|
MOVWLZX (BX)(R10*2), SI
|
|
MOVWLZX 8192(BX)(R11*2), R8
|
|
MOVW AX, (BX)(R10*2)
|
|
MOVW AX, 8192(BX)(R11*2)
|
|
MOVQ (DX)(SI*1), R10
|
|
CMPQ R10, DI
|
|
JEQ candidate_match_encodeBetterBlockAsm4K
|
|
MOVQ (DX)(R8*1), R11
|
|
CMPQ R11, DI
|
|
MOVL AX, R12
|
|
SUBL 16(SP), R12
|
|
MOVQ (DX)(R12*1), R12
|
|
MOVQ $0x000000ffffffff00, R13
|
|
XORQ DI, R12
|
|
TESTQ R13, R12
|
|
JNE no_repeat_found_encodeBetterBlockAsm4K
|
|
LEAL 1(AX), BX
|
|
MOVL 12(SP), SI
|
|
MOVL BX, DI
|
|
SUBL 16(SP), DI
|
|
JZ repeat_extend_back_end_encodeBetterBlockAsm4K
|
|
|
|
repeat_extend_back_loop_encodeBetterBlockAsm4K:
|
|
CMPL BX, SI
|
|
JBE repeat_extend_back_end_encodeBetterBlockAsm4K
|
|
MOVB -1(DX)(DI*1), R8
|
|
MOVB -1(DX)(BX*1), R9
|
|
CMPB R8, R9
|
|
JNE repeat_extend_back_end_encodeBetterBlockAsm4K
|
|
LEAL -1(BX), BX
|
|
DECL DI
|
|
JNZ repeat_extend_back_loop_encodeBetterBlockAsm4K
|
|
|
|
repeat_extend_back_end_encodeBetterBlockAsm4K:
|
|
MOVL BX, SI
|
|
SUBL 12(SP), SI
|
|
LEAQ 3(CX)(SI*1), SI
|
|
CMPQ SI, (SP)
|
|
JB repeat_dst_size_check_encodeBetterBlockAsm4K
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
repeat_dst_size_check_encodeBetterBlockAsm4K:
|
|
// emitLiteralsDstP
|
|
MOVL 12(SP), SI
|
|
CMPL SI, BX
|
|
JEQ emit_literal_done_repeat_emit_encodeBetterBlockAsm4K
|
|
MOVL BX, DI
|
|
MOVL BX, 12(SP)
|
|
LEAQ (DX)(SI*1), R8
|
|
SUBL SI, DI
|
|
|
|
// emitLiteral
|
|
LEAL -1(DI), SI
|
|
CMPL SI, $0x1d
|
|
JB one_byte_repeat_emit_encodeBetterBlockAsm4K
|
|
SUBL $0x1d, SI
|
|
CMPL SI, $0x00000100
|
|
JB two_bytes_repeat_emit_encodeBetterBlockAsm4K
|
|
JB three_bytes_repeat_emit_encodeBetterBlockAsm4K
|
|
|
|
three_bytes_repeat_emit_encodeBetterBlockAsm4K:
|
|
MOVB $0xf0, (CX)
|
|
MOVW SI, 1(CX)
|
|
ADDQ $0x03, CX
|
|
ADDL $0x1d, SI
|
|
JMP memmove_long_repeat_emit_encodeBetterBlockAsm4K
|
|
|
|
two_bytes_repeat_emit_encodeBetterBlockAsm4K:
|
|
MOVB $0xe8, (CX)
|
|
MOVB SI, 1(CX)
|
|
ADDL $0x1d, SI
|
|
ADDQ $0x02, CX
|
|
CMPL SI, $0x40
|
|
JB memmove_midrepeat_emit_encodeBetterBlockAsm4K
|
|
JMP memmove_long_repeat_emit_encodeBetterBlockAsm4K
|
|
|
|
one_byte_repeat_emit_encodeBetterBlockAsm4K:
|
|
SHLB $0x03, SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x01, CX
|
|
LEAQ (CX)(DI*1), SI
|
|
|
|
// genMemMoveShort
|
|
// margin: 8, min move: 1
|
|
CMPQ DI, $0x08
|
|
JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4K_memmove_move_8
|
|
CMPQ DI, $0x10
|
|
JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4K_memmove_move_8through16
|
|
CMPQ DI, $0x20
|
|
JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4K_memmove_move_17through32
|
|
JMP emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4K_memmove_move_8:
|
|
MOVQ (R8), R9
|
|
MOVQ R9, (CX)
|
|
JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm4K
|
|
|
|
emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4K_memmove_move_8through16:
|
|
MOVQ (R8), R9
|
|
MOVQ -8(R8)(DI*1), R8
|
|
MOVQ R9, (CX)
|
|
MOVQ R8, -8(CX)(DI*1)
|
|
JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm4K
|
|
|
|
emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4K_memmove_move_17through32:
|
|
MOVOU (R8), X0
|
|
MOVOU -16(R8)(DI*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(DI*1)
|
|
JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm4K
|
|
|
|
emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4K_memmove_move_33through64:
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(DI*1), X2
|
|
MOVOU -16(R8)(DI*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(DI*1)
|
|
MOVOU X3, -16(CX)(DI*1)
|
|
|
|
memmove_end_copy_repeat_emit_encodeBetterBlockAsm4K:
|
|
MOVQ SI, CX
|
|
JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm4K
|
|
|
|
memmove_midrepeat_emit_encodeBetterBlockAsm4K:
|
|
LEAQ (CX)(DI*1), SI
|
|
|
|
// genMemMoveShort
|
|
// margin: 8, min move: 30
|
|
CMPQ DI, $0x20
|
|
JBE emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm4K_memmove_move_17through32
|
|
JMP emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm4K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm4K_memmove_move_17through32:
|
|
MOVOU (R8), X0
|
|
MOVOU -16(R8)(DI*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(DI*1)
|
|
JMP memmove_mid_end_copy_repeat_emit_encodeBetterBlockAsm4K
|
|
|
|
emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm4K_memmove_move_33through64:
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(DI*1), X2
|
|
MOVOU -16(R8)(DI*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(DI*1)
|
|
MOVOU X3, -16(CX)(DI*1)
|
|
|
|
memmove_mid_end_copy_repeat_emit_encodeBetterBlockAsm4K:
|
|
MOVQ SI, CX
|
|
JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm4K
|
|
|
|
memmove_long_repeat_emit_encodeBetterBlockAsm4K:
|
|
LEAQ (CX)(DI*1), SI
|
|
|
|
// genMemMoveLong
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(DI*1), X2
|
|
MOVOU -16(R8)(DI*1), X3
|
|
MOVQ DI, R10
|
|
SHRQ $0x05, R10
|
|
MOVQ CX, R9
|
|
ANDL $0x0000001f, R9
|
|
MOVQ $0x00000040, R11
|
|
SUBQ R9, R11
|
|
DECQ R10
|
|
JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm4Klarge_forward_sse_loop_32
|
|
LEAQ -32(R8)(R11*1), R9
|
|
LEAQ -32(CX)(R11*1), R12
|
|
|
|
emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm4Klarge_big_loop_back:
|
|
MOVOU (R9), X4
|
|
MOVOU 16(R9), X5
|
|
MOVOA X4, (R12)
|
|
MOVOA X5, 16(R12)
|
|
ADDQ $0x20, R12
|
|
ADDQ $0x20, R9
|
|
ADDQ $0x20, R11
|
|
DECQ R10
|
|
JNA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm4Klarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm4Klarge_forward_sse_loop_32:
|
|
MOVOU -32(R8)(R11*1), X4
|
|
MOVOU -16(R8)(R11*1), X5
|
|
MOVOA X4, -32(CX)(R11*1)
|
|
MOVOA X5, -16(CX)(R11*1)
|
|
ADDQ $0x20, R11
|
|
CMPQ DI, R11
|
|
JAE emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm4Klarge_forward_sse_loop_32
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(DI*1)
|
|
MOVOU X3, -16(CX)(DI*1)
|
|
MOVQ SI, CX
|
|
|
|
emit_literal_done_repeat_emit_encodeBetterBlockAsm4K:
|
|
ADDL $0x05, AX
|
|
MOVL AX, SI
|
|
SUBL 16(SP), SI
|
|
MOVQ src_len+32(FP), DI
|
|
SUBL AX, DI
|
|
LEAQ (DX)(AX*1), R8
|
|
LEAQ (DX)(SI*1), SI
|
|
|
|
// matchLen
|
|
XORL R10, R10
|
|
JMP matchlen_loop_16_entry_repeat_extend_encodeBetterBlockAsm4K
|
|
|
|
matchlen_loopback_16_repeat_extend_encodeBetterBlockAsm4K:
|
|
MOVQ (R8)(R10*1), R9
|
|
MOVQ 8(R8)(R10*1), R11
|
|
XORQ (SI)(R10*1), R9
|
|
JNZ matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm4K
|
|
XORQ 8(SI)(R10*1), R11
|
|
JNZ matchlen_bsf_16repeat_extend_encodeBetterBlockAsm4K
|
|
LEAL -16(DI), DI
|
|
LEAL 16(R10), R10
|
|
|
|
matchlen_loop_16_entry_repeat_extend_encodeBetterBlockAsm4K:
|
|
CMPL DI, $0x10
|
|
JAE matchlen_loopback_16_repeat_extend_encodeBetterBlockAsm4K
|
|
JMP matchlen_match8_repeat_extend_encodeBetterBlockAsm4K
|
|
|
|
matchlen_bsf_16repeat_extend_encodeBetterBlockAsm4K:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R11, R11
|
|
|
|
#else
|
|
BSFQ R11, R11
|
|
|
|
#endif
|
|
SARQ $0x03, R11
|
|
LEAL 8(R10)(R11*1), R10
|
|
JMP repeat_extend_forward_end_encodeBetterBlockAsm4K
|
|
|
|
matchlen_match8_repeat_extend_encodeBetterBlockAsm4K:
|
|
CMPL DI, $0x08
|
|
JB matchlen_match4_repeat_extend_encodeBetterBlockAsm4K
|
|
MOVQ (R8)(R10*1), R9
|
|
XORQ (SI)(R10*1), R9
|
|
JNZ matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm4K
|
|
LEAL -8(DI), DI
|
|
LEAL 8(R10), R10
|
|
JMP matchlen_match4_repeat_extend_encodeBetterBlockAsm4K
|
|
|
|
matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm4K:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R9, R9
|
|
|
|
#else
|
|
BSFQ R9, R9
|
|
|
|
#endif
|
|
SARQ $0x03, R9
|
|
LEAL (R10)(R9*1), R10
|
|
JMP repeat_extend_forward_end_encodeBetterBlockAsm4K
|
|
|
|
matchlen_match4_repeat_extend_encodeBetterBlockAsm4K:
|
|
CMPL DI, $0x04
|
|
JB matchlen_match2_repeat_extend_encodeBetterBlockAsm4K
|
|
MOVL (R8)(R10*1), R9
|
|
CMPL (SI)(R10*1), R9
|
|
JNE matchlen_match2_repeat_extend_encodeBetterBlockAsm4K
|
|
LEAL -4(DI), DI
|
|
LEAL 4(R10), R10
|
|
|
|
matchlen_match2_repeat_extend_encodeBetterBlockAsm4K:
|
|
CMPL DI, $0x01
|
|
JE matchlen_match1_repeat_extend_encodeBetterBlockAsm4K
|
|
JB repeat_extend_forward_end_encodeBetterBlockAsm4K
|
|
MOVW (R8)(R10*1), R9
|
|
CMPW (SI)(R10*1), R9
|
|
JNE matchlen_match1_repeat_extend_encodeBetterBlockAsm4K
|
|
LEAL 2(R10), R10
|
|
SUBL $0x02, DI
|
|
JZ repeat_extend_forward_end_encodeBetterBlockAsm4K
|
|
|
|
matchlen_match1_repeat_extend_encodeBetterBlockAsm4K:
|
|
MOVB (R8)(R10*1), R9
|
|
CMPB (SI)(R10*1), R9
|
|
JNE repeat_extend_forward_end_encodeBetterBlockAsm4K
|
|
LEAL 1(R10), R10
|
|
|
|
repeat_extend_forward_end_encodeBetterBlockAsm4K:
|
|
ADDL R10, AX
|
|
MOVL AX, SI
|
|
SUBL BX, SI
|
|
MOVL 16(SP), BX
|
|
|
|
// emitRepeat
|
|
LEAL -1(SI), BX
|
|
CMPL SI, $0x1d
|
|
JBE repeat_one_match_repeat_encodeBetterBlockAsm4K
|
|
LEAL -30(SI), BX
|
|
CMPL SI, $0x0000011e
|
|
JB repeat_two_match_repeat_encodeBetterBlockAsm4K
|
|
CMPL SI, $0x0001001e
|
|
JB repeat_three_match_repeat_encodeBetterBlockAsm4K
|
|
MOVB $0xfc, (CX)
|
|
MOVL BX, 1(CX)
|
|
ADDQ $0x04, CX
|
|
JMP repeat_end_emit_encodeBetterBlockAsm4K
|
|
|
|
repeat_three_match_repeat_encodeBetterBlockAsm4K:
|
|
MOVB $0xf4, (CX)
|
|
MOVW BX, 1(CX)
|
|
ADDQ $0x03, CX
|
|
JMP repeat_end_emit_encodeBetterBlockAsm4K
|
|
|
|
repeat_two_match_repeat_encodeBetterBlockAsm4K:
|
|
MOVB $0xec, (CX)
|
|
MOVB BL, 1(CX)
|
|
ADDQ $0x02, CX
|
|
JMP repeat_end_emit_encodeBetterBlockAsm4K
|
|
|
|
repeat_one_match_repeat_encodeBetterBlockAsm4K:
|
|
XORL BX, BX
|
|
LEAL -4(BX)(SI*8), BX
|
|
MOVB BL, (CX)
|
|
ADDQ $0x01, CX
|
|
|
|
repeat_end_emit_encodeBetterBlockAsm4K:
|
|
MOVL AX, 12(SP)
|
|
JMP search_loop_encodeBetterBlockAsm4K
|
|
|
|
no_repeat_found_encodeBetterBlockAsm4K:
|
|
CMPL R10, DI
|
|
JEQ candidate_match_encodeBetterBlockAsm4K
|
|
CMPL R11, DI
|
|
JEQ candidateS_match_encodeBetterBlockAsm4K
|
|
MOVL 20(SP), AX
|
|
JMP search_loop_encodeBetterBlockAsm4K
|
|
|
|
candidateS_match_encodeBetterBlockAsm4K:
|
|
SHRQ $0x08, DI
|
|
MOVQ DI, R10
|
|
SHLQ $0x10, R10
|
|
IMULQ R9, R10
|
|
SHRQ $0x34, R10
|
|
MOVWLZX (BX)(R10*2), SI
|
|
INCL AX
|
|
MOVW AX, (BX)(R10*2)
|
|
CMPL (DX)(SI*1), DI
|
|
JEQ candidate_match_encodeBetterBlockAsm4K
|
|
DECL AX
|
|
MOVL R8, SI
|
|
|
|
candidate_match_encodeBetterBlockAsm4K:
|
|
MOVL 12(SP), BX
|
|
TESTL SI, SI
|
|
JZ match_extend_back_end_encodeBetterBlockAsm4K
|
|
|
|
match_extend_back_loop_encodeBetterBlockAsm4K:
|
|
CMPL AX, BX
|
|
JBE match_extend_back_end_encodeBetterBlockAsm4K
|
|
MOVB -1(DX)(SI*1), DI
|
|
MOVB -1(DX)(AX*1), R8
|
|
CMPB DI, R8
|
|
JNE match_extend_back_end_encodeBetterBlockAsm4K
|
|
LEAL -1(AX), AX
|
|
DECL SI
|
|
JZ match_extend_back_end_encodeBetterBlockAsm4K
|
|
JMP match_extend_back_loop_encodeBetterBlockAsm4K
|
|
|
|
match_extend_back_end_encodeBetterBlockAsm4K:
|
|
MOVL AX, BX
|
|
SUBL 12(SP), BX
|
|
LEAQ 3(CX)(BX*1), BX
|
|
CMPQ BX, (SP)
|
|
JB match_dst_size_check_encodeBetterBlockAsm4K
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
match_dst_size_check_encodeBetterBlockAsm4K:
|
|
MOVL AX, BX
|
|
ADDL $0x04, AX
|
|
ADDL $0x04, SI
|
|
MOVQ src_len+32(FP), DI
|
|
SUBL AX, DI
|
|
LEAQ (DX)(AX*1), R8
|
|
LEAQ (DX)(SI*1), R9
|
|
|
|
// matchLen
|
|
XORL R11, R11
|
|
JMP matchlen_loop_16_entry_match_nolit_encodeBetterBlockAsm4K
|
|
|
|
matchlen_loopback_16_match_nolit_encodeBetterBlockAsm4K:
|
|
MOVQ (R8)(R11*1), R10
|
|
MOVQ 8(R8)(R11*1), R12
|
|
XORQ (R9)(R11*1), R10
|
|
JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm4K
|
|
XORQ 8(R9)(R11*1), R12
|
|
JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm4K
|
|
LEAL -16(DI), DI
|
|
LEAL 16(R11), R11
|
|
|
|
matchlen_loop_16_entry_match_nolit_encodeBetterBlockAsm4K:
|
|
CMPL DI, $0x10
|
|
JAE matchlen_loopback_16_match_nolit_encodeBetterBlockAsm4K
|
|
JMP matchlen_match8_match_nolit_encodeBetterBlockAsm4K
|
|
|
|
matchlen_bsf_16match_nolit_encodeBetterBlockAsm4K:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R12, R12
|
|
|
|
#else
|
|
BSFQ R12, R12
|
|
|
|
#endif
|
|
SARQ $0x03, R12
|
|
LEAL 8(R11)(R12*1), R11
|
|
JMP match_nolit_end_encodeBetterBlockAsm4K
|
|
|
|
matchlen_match8_match_nolit_encodeBetterBlockAsm4K:
|
|
CMPL DI, $0x08
|
|
JB matchlen_match4_match_nolit_encodeBetterBlockAsm4K
|
|
MOVQ (R8)(R11*1), R10
|
|
XORQ (R9)(R11*1), R10
|
|
JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm4K
|
|
LEAL -8(DI), DI
|
|
LEAL 8(R11), R11
|
|
JMP matchlen_match4_match_nolit_encodeBetterBlockAsm4K
|
|
|
|
matchlen_bsf_8_match_nolit_encodeBetterBlockAsm4K:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R10, R10
|
|
|
|
#else
|
|
BSFQ R10, R10
|
|
|
|
#endif
|
|
SARQ $0x03, R10
|
|
LEAL (R11)(R10*1), R11
|
|
JMP match_nolit_end_encodeBetterBlockAsm4K
|
|
|
|
matchlen_match4_match_nolit_encodeBetterBlockAsm4K:
|
|
CMPL DI, $0x04
|
|
JB matchlen_match2_match_nolit_encodeBetterBlockAsm4K
|
|
MOVL (R8)(R11*1), R10
|
|
CMPL (R9)(R11*1), R10
|
|
JNE matchlen_match2_match_nolit_encodeBetterBlockAsm4K
|
|
LEAL -4(DI), DI
|
|
LEAL 4(R11), R11
|
|
|
|
matchlen_match2_match_nolit_encodeBetterBlockAsm4K:
|
|
CMPL DI, $0x01
|
|
JE matchlen_match1_match_nolit_encodeBetterBlockAsm4K
|
|
JB match_nolit_end_encodeBetterBlockAsm4K
|
|
MOVW (R8)(R11*1), R10
|
|
CMPW (R9)(R11*1), R10
|
|
JNE matchlen_match1_match_nolit_encodeBetterBlockAsm4K
|
|
LEAL 2(R11), R11
|
|
SUBL $0x02, DI
|
|
JZ match_nolit_end_encodeBetterBlockAsm4K
|
|
|
|
matchlen_match1_match_nolit_encodeBetterBlockAsm4K:
|
|
MOVB (R8)(R11*1), R10
|
|
CMPB (R9)(R11*1), R10
|
|
JNE match_nolit_end_encodeBetterBlockAsm4K
|
|
LEAL 1(R11), R11
|
|
|
|
match_nolit_end_encodeBetterBlockAsm4K:
|
|
MOVL AX, DI
|
|
SUBL SI, DI
|
|
MOVL DI, 16(SP)
|
|
|
|
// Check if we can combine lit+copy
|
|
MOVLQZX 12(SP), R8
|
|
MOVL BX, SI
|
|
SUBL R8, SI
|
|
JZ match_emit_nolits_encodeBetterBlockAsm4K
|
|
CMPL DI, $0x00000040
|
|
JL match_emit_lits_encodeBetterBlockAsm4K
|
|
CMPL SI, $0x04
|
|
JA match_emit_lits_encodeBetterBlockAsm4K
|
|
MOVL (DX)(R8*1), R8
|
|
ADDL R11, AX
|
|
ADDL $0x04, R11
|
|
MOVL AX, 12(SP)
|
|
|
|
// emitCopy2WithLits
|
|
XORQ R9, R9
|
|
SUBL $0x40, DI
|
|
LEAL -11(R11), R10
|
|
LEAL -4(R11), R11
|
|
MOVW DI, 1(CX)
|
|
CMPL R11, $0x07
|
|
CMOVLGE R10, R9
|
|
MOVQ $0x00000007, DI
|
|
CMOVLLT R11, DI
|
|
LEAL -1(SI)(DI*4), DI
|
|
MOVL $0x00000003, R10
|
|
LEAL (R10)(DI*8), DI
|
|
MOVB DI, (CX)
|
|
ADDQ $0x03, CX
|
|
MOVL R8, (CX)
|
|
ADDQ SI, CX
|
|
TESTL R9, R9
|
|
JZ match_nolit_emitcopy_end_encodeBetterBlockAsm4K
|
|
|
|
// emitRepeat
|
|
LEAL -1(R9), SI
|
|
CMPL R9, $0x1d
|
|
JBE repeat_one_match_emit_repeat_copy2_encodeBetterBlockAsm4K
|
|
LEAL -30(R9), SI
|
|
CMPL R9, $0x0000011e
|
|
JB repeat_two_match_emit_repeat_copy2_encodeBetterBlockAsm4K
|
|
CMPL R9, $0x0001001e
|
|
JB repeat_three_match_emit_repeat_copy2_encodeBetterBlockAsm4K
|
|
MOVB $0xfc, (CX)
|
|
MOVL SI, 1(CX)
|
|
ADDQ $0x04, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4K
|
|
|
|
repeat_three_match_emit_repeat_copy2_encodeBetterBlockAsm4K:
|
|
MOVB $0xf4, (CX)
|
|
MOVW SI, 1(CX)
|
|
ADDQ $0x03, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4K
|
|
|
|
repeat_two_match_emit_repeat_copy2_encodeBetterBlockAsm4K:
|
|
MOVB $0xec, (CX)
|
|
MOVB SI, 1(CX)
|
|
ADDQ $0x02, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4K
|
|
|
|
repeat_one_match_emit_repeat_copy2_encodeBetterBlockAsm4K:
|
|
XORL SI, SI
|
|
LEAL -4(SI)(R9*8), SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x01, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4K
|
|
|
|
match_emit_lits_encodeBetterBlockAsm4K:
|
|
LEAQ (DX)(R8*1), R8
|
|
|
|
// emitLiteral
|
|
LEAL -1(SI), R9
|
|
CMPL R9, $0x1d
|
|
JB one_byte_match_emit_encodeBetterBlockAsm4K
|
|
SUBL $0x1d, R9
|
|
CMPL R9, $0x00000100
|
|
JB two_bytes_match_emit_encodeBetterBlockAsm4K
|
|
JB three_bytes_match_emit_encodeBetterBlockAsm4K
|
|
|
|
three_bytes_match_emit_encodeBetterBlockAsm4K:
|
|
MOVB $0xf0, (CX)
|
|
MOVW R9, 1(CX)
|
|
ADDQ $0x03, CX
|
|
ADDL $0x1d, R9
|
|
JMP memmove_long_match_emit_encodeBetterBlockAsm4K
|
|
|
|
two_bytes_match_emit_encodeBetterBlockAsm4K:
|
|
MOVB $0xe8, (CX)
|
|
MOVB R9, 1(CX)
|
|
ADDL $0x1d, R9
|
|
ADDQ $0x02, CX
|
|
CMPL R9, $0x40
|
|
JB memmove_midmatch_emit_encodeBetterBlockAsm4K
|
|
JMP memmove_long_match_emit_encodeBetterBlockAsm4K
|
|
|
|
one_byte_match_emit_encodeBetterBlockAsm4K:
|
|
SHLB $0x03, R9
|
|
MOVB R9, (CX)
|
|
ADDQ $0x01, CX
|
|
LEAQ (CX)(SI*1), R9
|
|
|
|
// genMemMoveShort
|
|
// margin: 8, min move: 1
|
|
CMPQ SI, $0x08
|
|
JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4K_memmove_move_8
|
|
CMPQ SI, $0x10
|
|
JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4K_memmove_move_8through16
|
|
CMPQ SI, $0x20
|
|
JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4K_memmove_move_17through32
|
|
JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm4K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_match_emit_encodeBetterBlockAsm4K_memmove_move_8:
|
|
MOVQ (R8), R10
|
|
MOVQ R10, (CX)
|
|
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4K
|
|
|
|
emit_lit_memmove_match_emit_encodeBetterBlockAsm4K_memmove_move_8through16:
|
|
MOVQ (R8), R10
|
|
MOVQ -8(R8)(SI*1), R8
|
|
MOVQ R10, (CX)
|
|
MOVQ R8, -8(CX)(SI*1)
|
|
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4K
|
|
|
|
emit_lit_memmove_match_emit_encodeBetterBlockAsm4K_memmove_move_17through32:
|
|
MOVOU (R8), X0
|
|
MOVOU -16(R8)(SI*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(SI*1)
|
|
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4K
|
|
|
|
emit_lit_memmove_match_emit_encodeBetterBlockAsm4K_memmove_move_33through64:
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(SI*1), X2
|
|
MOVOU -16(R8)(SI*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(SI*1)
|
|
MOVOU X3, -16(CX)(SI*1)
|
|
|
|
memmove_end_copy_match_emit_encodeBetterBlockAsm4K:
|
|
MOVQ R9, CX
|
|
JMP match_emit_nolits_encodeBetterBlockAsm4K
|
|
|
|
memmove_midmatch_emit_encodeBetterBlockAsm4K:
|
|
LEAQ (CX)(SI*1), R9
|
|
|
|
// genMemMoveShort
|
|
// margin: 8, min move: 30
|
|
CMPQ SI, $0x20
|
|
JBE emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm4K_memmove_move_17through32
|
|
JMP emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm4K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm4K_memmove_move_17through32:
|
|
MOVOU (R8), X0
|
|
MOVOU -16(R8)(SI*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(SI*1)
|
|
JMP memmove_mid_end_copy_match_emit_encodeBetterBlockAsm4K
|
|
|
|
emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm4K_memmove_move_33through64:
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(SI*1), X2
|
|
MOVOU -16(R8)(SI*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(SI*1)
|
|
MOVOU X3, -16(CX)(SI*1)
|
|
|
|
memmove_mid_end_copy_match_emit_encodeBetterBlockAsm4K:
|
|
MOVQ R9, CX
|
|
JMP match_emit_nolits_encodeBetterBlockAsm4K
|
|
|
|
memmove_long_match_emit_encodeBetterBlockAsm4K:
|
|
LEAQ (CX)(SI*1), R9
|
|
|
|
// genMemMoveLong
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(SI*1), X2
|
|
MOVOU -16(R8)(SI*1), X3
|
|
MOVQ SI, R12
|
|
SHRQ $0x05, R12
|
|
MOVQ CX, R10
|
|
ANDL $0x0000001f, R10
|
|
MOVQ $0x00000040, R13
|
|
SUBQ R10, R13
|
|
DECQ R12
|
|
JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4Klarge_forward_sse_loop_32
|
|
LEAQ -32(R8)(R13*1), R10
|
|
LEAQ -32(CX)(R13*1), R14
|
|
|
|
emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4Klarge_big_loop_back:
|
|
MOVOU (R10), X4
|
|
MOVOU 16(R10), X5
|
|
MOVOA X4, (R14)
|
|
MOVOA X5, 16(R14)
|
|
ADDQ $0x20, R14
|
|
ADDQ $0x20, R10
|
|
ADDQ $0x20, R13
|
|
DECQ R12
|
|
JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4Klarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4Klarge_forward_sse_loop_32:
|
|
MOVOU -32(R8)(R13*1), X4
|
|
MOVOU -16(R8)(R13*1), X5
|
|
MOVOA X4, -32(CX)(R13*1)
|
|
MOVOA X5, -16(CX)(R13*1)
|
|
ADDQ $0x20, R13
|
|
CMPQ SI, R13
|
|
JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4Klarge_forward_sse_loop_32
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(SI*1)
|
|
MOVOU X3, -16(CX)(SI*1)
|
|
MOVQ R9, CX
|
|
|
|
match_emit_nolits_encodeBetterBlockAsm4K:
|
|
ADDL R11, AX
|
|
ADDL $0x04, R11
|
|
MOVL AX, 12(SP)
|
|
|
|
// emitCopy
|
|
CMPL DI, $0x00000400
|
|
JA two_byte_match_nolit_encodeBetterBlockAsm4K
|
|
CMPL R11, $0x00000013
|
|
JAE emit_one_longer_match_nolit_encodeBetterBlockAsm4K
|
|
LEAL -1(DI), SI
|
|
SHLL $0x06, SI
|
|
LEAL -15(SI)(R11*4), SI
|
|
MOVW SI, (CX)
|
|
ADDQ $0x02, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4K
|
|
|
|
emit_one_longer_match_nolit_encodeBetterBlockAsm4K:
|
|
CMPL R11, $0x00000112
|
|
JAE emit_copy1_repeat_match_nolit_encodeBetterBlockAsm4K
|
|
LEAL -1(DI), SI
|
|
SHLL $0x06, SI
|
|
LEAL 61(SI), SI
|
|
MOVW SI, (CX)
|
|
LEAL -18(R11), SI
|
|
MOVB SI, 2(CX)
|
|
ADDQ $0x03, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4K
|
|
|
|
emit_copy1_repeat_match_nolit_encodeBetterBlockAsm4K:
|
|
LEAL -1(DI), SI
|
|
SHLL $0x06, SI
|
|
LEAL 57(SI), SI
|
|
MOVW SI, (CX)
|
|
ADDQ $0x02, CX
|
|
SUBL $0x12, R11
|
|
|
|
// emitRepeat
|
|
LEAL -1(R11), SI
|
|
CMPL R11, $0x1d
|
|
JBE repeat_one_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm4K
|
|
LEAL -30(R11), SI
|
|
CMPL R11, $0x0000011e
|
|
JB repeat_two_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm4K
|
|
CMPL R11, $0x0001001e
|
|
JB repeat_three_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm4K
|
|
MOVB $0xfc, (CX)
|
|
MOVL SI, 1(CX)
|
|
ADDQ $0x04, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4K
|
|
|
|
repeat_three_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm4K:
|
|
MOVB $0xf4, (CX)
|
|
MOVW SI, 1(CX)
|
|
ADDQ $0x03, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4K
|
|
|
|
repeat_two_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm4K:
|
|
MOVB $0xec, (CX)
|
|
MOVB SI, 1(CX)
|
|
ADDQ $0x02, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4K
|
|
|
|
repeat_one_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm4K:
|
|
XORL SI, SI
|
|
LEAL -4(SI)(R11*8), SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x01, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4K
|
|
|
|
two_byte_match_nolit_encodeBetterBlockAsm4K:
|
|
// emitCopy2
|
|
LEAL -64(DI), DI
|
|
LEAL -4(R11), R11
|
|
MOVW DI, 1(CX)
|
|
CMPL R11, $0x3c
|
|
JBE emit_copy2_0_match_nolit_encodeBetterBlockAsm4K_emit2
|
|
LEAL -60(R11), SI
|
|
CMPL R11, $0x0000013c
|
|
JB emit_copy2_1_match_nolit_encodeBetterBlockAsm4K_emit2
|
|
CMPL R11, $0x0001003c
|
|
JB emit_copy2_2_match_nolit_encodeBetterBlockAsm4K_emit2
|
|
MOVB $0xfe, (CX)
|
|
MOVL SI, 3(CX)
|
|
ADDQ $0x06, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4K
|
|
|
|
emit_copy2_2_match_nolit_encodeBetterBlockAsm4K_emit2:
|
|
MOVB $0xfa, (CX)
|
|
MOVW SI, 3(CX)
|
|
ADDQ $0x05, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4K
|
|
|
|
emit_copy2_1_match_nolit_encodeBetterBlockAsm4K_emit2:
|
|
MOVB $0xf6, (CX)
|
|
MOVB SI, 3(CX)
|
|
ADDQ $0x04, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4K
|
|
|
|
emit_copy2_0_match_nolit_encodeBetterBlockAsm4K_emit2:
|
|
MOVL $0x00000002, SI
|
|
LEAL (SI)(R11*4), SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x03, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4K
|
|
|
|
// emitLiteralsDstP
|
|
MOVL 12(SP), SI
|
|
CMPL SI, BX
|
|
JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4K
|
|
MOVL BX, DI
|
|
MOVL BX, 12(SP)
|
|
LEAQ (DX)(SI*1), R8
|
|
SUBL SI, DI
|
|
|
|
// emitLiteral
|
|
LEAL -1(DI), SI
|
|
CMPL SI, $0x1d
|
|
JB one_byte_match_emit_repeat_encodeBetterBlockAsm4K
|
|
SUBL $0x1d, SI
|
|
CMPL SI, $0x00000100
|
|
JB two_bytes_match_emit_repeat_encodeBetterBlockAsm4K
|
|
JB three_bytes_match_emit_repeat_encodeBetterBlockAsm4K
|
|
|
|
three_bytes_match_emit_repeat_encodeBetterBlockAsm4K:
|
|
MOVB $0xf0, (CX)
|
|
MOVW SI, 1(CX)
|
|
ADDQ $0x03, CX
|
|
ADDL $0x1d, SI
|
|
JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4K
|
|
|
|
two_bytes_match_emit_repeat_encodeBetterBlockAsm4K:
|
|
MOVB $0xe8, (CX)
|
|
MOVB SI, 1(CX)
|
|
ADDL $0x1d, SI
|
|
ADDQ $0x02, CX
|
|
CMPL SI, $0x40
|
|
JB memmove_midmatch_emit_repeat_encodeBetterBlockAsm4K
|
|
JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4K
|
|
|
|
one_byte_match_emit_repeat_encodeBetterBlockAsm4K:
|
|
SHLB $0x03, SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x01, CX
|
|
LEAQ (CX)(DI*1), SI
|
|
|
|
// genMemMoveShort
|
|
// margin: 8, min move: 1
|
|
CMPQ DI, $0x08
|
|
JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4K_memmove_move_8
|
|
CMPQ DI, $0x10
|
|
JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4K_memmove_move_8through16
|
|
CMPQ DI, $0x20
|
|
JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4K_memmove_move_17through32
|
|
JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4K_memmove_move_8:
|
|
MOVQ (R8), R9
|
|
MOVQ R9, (CX)
|
|
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4K
|
|
|
|
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4K_memmove_move_8through16:
|
|
MOVQ (R8), R9
|
|
MOVQ -8(R8)(DI*1), R8
|
|
MOVQ R9, (CX)
|
|
MOVQ R8, -8(CX)(DI*1)
|
|
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4K
|
|
|
|
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4K_memmove_move_17through32:
|
|
MOVOU (R8), X0
|
|
MOVOU -16(R8)(DI*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(DI*1)
|
|
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4K
|
|
|
|
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4K_memmove_move_33through64:
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(DI*1), X2
|
|
MOVOU -16(R8)(DI*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(DI*1)
|
|
MOVOU X3, -16(CX)(DI*1)
|
|
|
|
memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4K:
|
|
MOVQ SI, CX
|
|
JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4K
|
|
|
|
memmove_midmatch_emit_repeat_encodeBetterBlockAsm4K:
|
|
LEAQ (CX)(DI*1), SI
|
|
|
|
// genMemMoveShort
|
|
// margin: 8, min move: 30
|
|
CMPQ DI, $0x20
|
|
JBE emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm4K_memmove_move_17through32
|
|
JMP emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm4K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm4K_memmove_move_17through32:
|
|
MOVOU (R8), X0
|
|
MOVOU -16(R8)(DI*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(DI*1)
|
|
JMP memmove_mid_end_copy_match_emit_repeat_encodeBetterBlockAsm4K
|
|
|
|
emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm4K_memmove_move_33through64:
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(DI*1), X2
|
|
MOVOU -16(R8)(DI*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(DI*1)
|
|
MOVOU X3, -16(CX)(DI*1)
|
|
|
|
memmove_mid_end_copy_match_emit_repeat_encodeBetterBlockAsm4K:
|
|
MOVQ SI, CX
|
|
JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4K
|
|
|
|
memmove_long_match_emit_repeat_encodeBetterBlockAsm4K:
|
|
LEAQ (CX)(DI*1), SI
|
|
|
|
// genMemMoveLong
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(DI*1), X2
|
|
MOVOU -16(R8)(DI*1), X3
|
|
MOVQ DI, R10
|
|
SHRQ $0x05, R10
|
|
MOVQ CX, R9
|
|
ANDL $0x0000001f, R9
|
|
MOVQ $0x00000040, R12
|
|
SUBQ R9, R12
|
|
DECQ R10
|
|
JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4Klarge_forward_sse_loop_32
|
|
LEAQ -32(R8)(R12*1), R9
|
|
LEAQ -32(CX)(R12*1), R13
|
|
|
|
emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4Klarge_big_loop_back:
|
|
MOVOU (R9), X4
|
|
MOVOU 16(R9), X5
|
|
MOVOA X4, (R13)
|
|
MOVOA X5, 16(R13)
|
|
ADDQ $0x20, R13
|
|
ADDQ $0x20, R9
|
|
ADDQ $0x20, R12
|
|
DECQ R10
|
|
JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4Klarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4Klarge_forward_sse_loop_32:
|
|
MOVOU -32(R8)(R12*1), X4
|
|
MOVOU -16(R8)(R12*1), X5
|
|
MOVOA X4, -32(CX)(R12*1)
|
|
MOVOA X5, -16(CX)(R12*1)
|
|
ADDQ $0x20, R12
|
|
CMPQ DI, R12
|
|
JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4Klarge_forward_sse_loop_32
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(DI*1)
|
|
MOVOU X3, -16(CX)(DI*1)
|
|
MOVQ SI, CX
|
|
|
|
emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4K:
|
|
ADDL R11, AX
|
|
ADDL $0x04, R11
|
|
MOVL AX, 12(SP)
|
|
|
|
// emitRepeat
|
|
LEAL -1(R11), SI
|
|
CMPL R11, $0x1d
|
|
JBE repeat_one_match_nolit_repeat_encodeBetterBlockAsm4K
|
|
LEAL -30(R11), SI
|
|
CMPL R11, $0x0000011e
|
|
JB repeat_two_match_nolit_repeat_encodeBetterBlockAsm4K
|
|
CMPL R11, $0x0001001e
|
|
JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm4K
|
|
MOVB $0xfc, (CX)
|
|
MOVL SI, 1(CX)
|
|
ADDQ $0x04, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4K
|
|
|
|
repeat_three_match_nolit_repeat_encodeBetterBlockAsm4K:
|
|
MOVB $0xf4, (CX)
|
|
MOVW SI, 1(CX)
|
|
ADDQ $0x03, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4K
|
|
|
|
repeat_two_match_nolit_repeat_encodeBetterBlockAsm4K:
|
|
MOVB $0xec, (CX)
|
|
MOVB SI, 1(CX)
|
|
ADDQ $0x02, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4K
|
|
|
|
repeat_one_match_nolit_repeat_encodeBetterBlockAsm4K:
|
|
XORL SI, SI
|
|
LEAL -4(SI)(R11*8), SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x01, CX
|
|
|
|
match_nolit_emitcopy_end_encodeBetterBlockAsm4K:
|
|
CMPL AX, 8(SP)
|
|
JAE emit_remainder_encodeBetterBlockAsm4K
|
|
CMPQ CX, (SP)
|
|
JB match_nolit_dst_ok_encodeBetterBlockAsm4K
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
match_nolit_dst_ok_encodeBetterBlockAsm4K:
|
|
MOVQ tmp+48(FP), SI
|
|
MOVQ $0x0000cf1bbcdcbf9b, DI
|
|
MOVQ $0x9e3779b1, R8
|
|
LEAQ 1(BX), BX
|
|
LEAQ -2(AX), R9
|
|
MOVQ (DX)(BX*1), R10
|
|
MOVQ 1(DX)(BX*1), R11
|
|
MOVQ (DX)(R9*1), R12
|
|
MOVQ 1(DX)(R9*1), R13
|
|
SHLQ $0x10, R10
|
|
IMULQ DI, R10
|
|
SHRQ $0x34, R10
|
|
SHLQ $0x20, R11
|
|
IMULQ R8, R11
|
|
SHRQ $0x36, R11
|
|
SHLQ $0x10, R12
|
|
IMULQ DI, R12
|
|
SHRQ $0x34, R12
|
|
SHLQ $0x20, R13
|
|
IMULQ R8, R13
|
|
SHRQ $0x36, R13
|
|
LEAQ 1(BX), R8
|
|
LEAQ 1(R9), R14
|
|
MOVW BX, (SI)(R10*2)
|
|
MOVW R9, (SI)(R12*2)
|
|
LEAQ 1(R9)(BX*1), R10
|
|
SHRQ $0x01, R10
|
|
ADDQ $0x01, BX
|
|
SUBQ $0x01, R9
|
|
MOVW R8, 8192(SI)(R11*2)
|
|
MOVW R14, 8192(SI)(R13*2)
|
|
|
|
index_loop_encodeBetterBlockAsm4K:
|
|
CMPQ R10, R9
|
|
JAE search_loop_encodeBetterBlockAsm4K
|
|
MOVQ (DX)(BX*1), R8
|
|
MOVQ (DX)(R10*1), R11
|
|
SHLQ $0x10, R8
|
|
IMULQ DI, R8
|
|
SHRQ $0x34, R8
|
|
SHLQ $0x10, R11
|
|
IMULQ DI, R11
|
|
SHRQ $0x34, R11
|
|
MOVW BX, (SI)(R8*2)
|
|
MOVW R9, (SI)(R11*2)
|
|
ADDQ $0x02, BX
|
|
ADDQ $0x02, R10
|
|
JMP index_loop_encodeBetterBlockAsm4K
|
|
|
|
emit_remainder_encodeBetterBlockAsm4K:
|
|
MOVQ src_len+32(FP), AX
|
|
SUBL 12(SP), AX
|
|
LEAQ 3(CX)(AX*1), AX
|
|
CMPQ AX, (SP)
|
|
JB emit_remainder_ok_encodeBetterBlockAsm4K
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
emit_remainder_ok_encodeBetterBlockAsm4K:
|
|
MOVQ src_len+32(FP), AX
|
|
|
|
// emitLiteralsDstP
|
|
MOVL 12(SP), BX
|
|
CMPL BX, AX
|
|
JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm4K
|
|
MOVL AX, SI
|
|
MOVL AX, 12(SP)
|
|
LEAQ (DX)(BX*1), AX
|
|
SUBL BX, SI
|
|
|
|
// emitLiteral
|
|
LEAL -1(SI), DX
|
|
CMPL DX, $0x1d
|
|
JB one_byte_emit_remainder_encodeBetterBlockAsm4K
|
|
SUBL $0x1d, DX
|
|
CMPL DX, $0x00000100
|
|
JB two_bytes_emit_remainder_encodeBetterBlockAsm4K
|
|
JB three_bytes_emit_remainder_encodeBetterBlockAsm4K
|
|
|
|
three_bytes_emit_remainder_encodeBetterBlockAsm4K:
|
|
MOVB $0xf0, (CX)
|
|
MOVW DX, 1(CX)
|
|
ADDQ $0x03, CX
|
|
ADDL $0x1d, DX
|
|
JMP memmove_long_emit_remainder_encodeBetterBlockAsm4K
|
|
|
|
two_bytes_emit_remainder_encodeBetterBlockAsm4K:
|
|
MOVB $0xe8, (CX)
|
|
MOVB DL, 1(CX)
|
|
ADDL $0x1d, DX
|
|
ADDQ $0x02, CX
|
|
CMPL DX, $0x40
|
|
JB memmove_midemit_remainder_encodeBetterBlockAsm4K
|
|
JMP memmove_long_emit_remainder_encodeBetterBlockAsm4K
|
|
|
|
one_byte_emit_remainder_encodeBetterBlockAsm4K:
|
|
SHLB $0x03, DL
|
|
MOVB DL, (CX)
|
|
ADDQ $0x01, CX
|
|
LEAQ (CX)(SI*1), DX
|
|
MOVL SI, BX
|
|
|
|
// genMemMoveShort
|
|
// margin: -1, min move: 1
|
|
CMPQ BX, $0x03
|
|
JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4K_memmove_move_1or2
|
|
JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4K_memmove_move_3
|
|
CMPQ BX, $0x08
|
|
JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4K_memmove_move_4through8
|
|
CMPQ BX, $0x10
|
|
JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4K_memmove_move_8through16
|
|
CMPQ BX, $0x20
|
|
JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4K_memmove_move_17through32
|
|
JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4K_memmove_move_1or2:
|
|
MOVB (AX), SI
|
|
MOVB -1(AX)(BX*1), AL
|
|
MOVB SI, (CX)
|
|
MOVB AL, -1(CX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4K
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4K_memmove_move_3:
|
|
MOVW (AX), SI
|
|
MOVB 2(AX), AL
|
|
MOVW SI, (CX)
|
|
MOVB AL, 2(CX)
|
|
JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4K
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4K_memmove_move_4through8:
|
|
MOVL (AX), SI
|
|
MOVL -4(AX)(BX*1), AX
|
|
MOVL SI, (CX)
|
|
MOVL AX, -4(CX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4K
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4K_memmove_move_8through16:
|
|
MOVQ (AX), SI
|
|
MOVQ -8(AX)(BX*1), AX
|
|
MOVQ SI, (CX)
|
|
MOVQ AX, -8(CX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4K
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4K_memmove_move_17through32:
|
|
MOVOU (AX), X0
|
|
MOVOU -16(AX)(BX*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4K
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4K_memmove_move_33through64:
|
|
MOVOU (AX), X0
|
|
MOVOU 16(AX), X1
|
|
MOVOU -32(AX)(BX*1), X2
|
|
MOVOU -16(AX)(BX*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(BX*1)
|
|
MOVOU X3, -16(CX)(BX*1)
|
|
|
|
memmove_end_copy_emit_remainder_encodeBetterBlockAsm4K:
|
|
MOVQ DX, CX
|
|
JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm4K
|
|
|
|
memmove_midemit_remainder_encodeBetterBlockAsm4K:
|
|
LEAQ (CX)(SI*1), DX
|
|
MOVL SI, BX
|
|
|
|
// genMemMoveShort
|
|
// margin: -2, min move: 30
|
|
CMPQ BX, $0x20
|
|
JBE emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm4K_memmove_move_17through32
|
|
JMP emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm4K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm4K_memmove_move_17through32:
|
|
MOVOU (AX), X0
|
|
MOVOU -16(AX)(BX*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(BX*1)
|
|
JMP memmove_mid_end_copy_emit_remainder_encodeBetterBlockAsm4K
|
|
|
|
emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm4K_memmove_move_33through64:
|
|
MOVOU (AX), X0
|
|
MOVOU 16(AX), X1
|
|
MOVOU -32(AX)(BX*1), X2
|
|
MOVOU -16(AX)(BX*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(BX*1)
|
|
MOVOU X3, -16(CX)(BX*1)
|
|
|
|
memmove_mid_end_copy_emit_remainder_encodeBetterBlockAsm4K:
|
|
MOVQ DX, CX
|
|
JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm4K
|
|
|
|
memmove_long_emit_remainder_encodeBetterBlockAsm4K:
|
|
LEAQ (CX)(SI*1), DX
|
|
MOVL SI, BX
|
|
|
|
// genMemMoveLong
|
|
MOVOU (AX), X0
|
|
MOVOU 16(AX), X1
|
|
MOVOU -32(AX)(BX*1), X2
|
|
MOVOU -16(AX)(BX*1), X3
|
|
MOVQ BX, DI
|
|
SHRQ $0x05, DI
|
|
MOVQ CX, SI
|
|
ANDL $0x0000001f, SI
|
|
MOVQ $0x00000040, R8
|
|
SUBQ SI, R8
|
|
DECQ DI
|
|
JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4Klarge_forward_sse_loop_32
|
|
LEAQ -32(AX)(R8*1), SI
|
|
LEAQ -32(CX)(R8*1), R9
|
|
|
|
emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4Klarge_big_loop_back:
|
|
MOVOU (SI), X4
|
|
MOVOU 16(SI), X5
|
|
MOVOA X4, (R9)
|
|
MOVOA X5, 16(R9)
|
|
ADDQ $0x20, R9
|
|
ADDQ $0x20, SI
|
|
ADDQ $0x20, R8
|
|
DECQ DI
|
|
JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4Klarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4Klarge_forward_sse_loop_32:
|
|
MOVOU -32(AX)(R8*1), X4
|
|
MOVOU -16(AX)(R8*1), X5
|
|
MOVOA X4, -32(CX)(R8*1)
|
|
MOVOA X5, -16(CX)(R8*1)
|
|
ADDQ $0x20, R8
|
|
CMPQ BX, R8
|
|
JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4Klarge_forward_sse_loop_32
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(BX*1)
|
|
MOVOU X3, -16(CX)(BX*1)
|
|
MOVQ DX, CX
|
|
|
|
emit_literal_done_emit_remainder_encodeBetterBlockAsm4K:
|
|
MOVQ dst_base+0(FP), AX
|
|
SUBQ AX, CX
|
|
MOVQ CX, ret+56(FP)
|
|
RET
|
|
|
|
// func encodeBetterBlockAsm1K(dst []byte, src []byte, tmp *[4608]byte) int
|
|
// Requires: BMI, CMOV, SSE2
|
|
TEXT ·encodeBetterBlockAsm1K(SB), $24-64
|
|
MOVQ tmp+48(FP), AX
|
|
MOVQ dst_base+0(FP), CX
|
|
MOVQ $0x00000024, DX
|
|
PXOR X0, X0
|
|
|
|
zero_loop_encodeBetterBlockAsm1K:
|
|
MOVOU X0, (AX)
|
|
MOVOU X0, 16(AX)
|
|
MOVOU X0, 32(AX)
|
|
MOVOU X0, 48(AX)
|
|
MOVOU X0, 64(AX)
|
|
MOVOU X0, 80(AX)
|
|
MOVOU X0, 96(AX)
|
|
MOVOU X0, 112(AX)
|
|
ADDQ $0x80, AX
|
|
DECQ DX
|
|
JNZ zero_loop_encodeBetterBlockAsm1K
|
|
MOVL $0x00000000, 12(SP)
|
|
MOVQ src_len+32(FP), AX
|
|
LEAQ -11(AX), DX
|
|
LEAQ -8(AX), BX
|
|
MOVL BX, 8(SP)
|
|
SHRQ $0x05, AX
|
|
SUBL AX, DX
|
|
LEAQ (CX)(DX*1), DX
|
|
MOVQ DX, (SP)
|
|
MOVL $0x00000001, AX
|
|
MOVL AX, 16(SP)
|
|
MOVQ src_base+24(FP), DX
|
|
|
|
search_loop_encodeBetterBlockAsm1K:
|
|
MOVQ tmp+48(FP), BX
|
|
MOVL AX, SI
|
|
SUBL 12(SP), SI
|
|
SHRL $0x04, SI
|
|
LEAL 1(AX)(SI*1), SI
|
|
CMPL SI, 8(SP)
|
|
JAE emit_remainder_encodeBetterBlockAsm1K
|
|
MOVQ (DX)(AX*1), DI
|
|
MOVL SI, 20(SP)
|
|
MOVQ $0x0000cf1bbcdcbf9b, R9
|
|
MOVQ $0x9e3779b1, SI
|
|
MOVQ DI, R10
|
|
MOVQ DI, R11
|
|
SHLQ $0x10, R10
|
|
IMULQ R9, R10
|
|
SHRQ $0x35, R10
|
|
SHLQ $0x20, R11
|
|
IMULQ SI, R11
|
|
SHRQ $0x38, R11
|
|
MOVWLZX (BX)(R10*2), SI
|
|
MOVWLZX 4096(BX)(R11*2), R8
|
|
MOVW AX, (BX)(R10*2)
|
|
MOVW AX, 4096(BX)(R11*2)
|
|
MOVQ (DX)(SI*1), R10
|
|
CMPQ R10, DI
|
|
JEQ candidate_match_encodeBetterBlockAsm1K
|
|
MOVQ (DX)(R8*1), R11
|
|
CMPQ R11, DI
|
|
MOVL AX, R12
|
|
SUBL 16(SP), R12
|
|
MOVQ (DX)(R12*1), R12
|
|
MOVQ $0x000000ffffffff00, R13
|
|
XORQ DI, R12
|
|
TESTQ R13, R12
|
|
JNE no_repeat_found_encodeBetterBlockAsm1K
|
|
LEAL 1(AX), BX
|
|
MOVL 12(SP), SI
|
|
MOVL BX, DI
|
|
SUBL 16(SP), DI
|
|
JZ repeat_extend_back_end_encodeBetterBlockAsm1K
|
|
|
|
repeat_extend_back_loop_encodeBetterBlockAsm1K:
|
|
CMPL BX, SI
|
|
JBE repeat_extend_back_end_encodeBetterBlockAsm1K
|
|
MOVB -1(DX)(DI*1), R8
|
|
MOVB -1(DX)(BX*1), R9
|
|
CMPB R8, R9
|
|
JNE repeat_extend_back_end_encodeBetterBlockAsm1K
|
|
LEAL -1(BX), BX
|
|
DECL DI
|
|
JNZ repeat_extend_back_loop_encodeBetterBlockAsm1K
|
|
|
|
repeat_extend_back_end_encodeBetterBlockAsm1K:
|
|
MOVL BX, SI
|
|
SUBL 12(SP), SI
|
|
LEAQ 3(CX)(SI*1), SI
|
|
CMPQ SI, (SP)
|
|
JB repeat_dst_size_check_encodeBetterBlockAsm1K
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
repeat_dst_size_check_encodeBetterBlockAsm1K:
|
|
// emitLiteralsDstP
|
|
MOVL 12(SP), SI
|
|
CMPL SI, BX
|
|
JEQ emit_literal_done_repeat_emit_encodeBetterBlockAsm1K
|
|
MOVL BX, DI
|
|
MOVL BX, 12(SP)
|
|
LEAQ (DX)(SI*1), R8
|
|
SUBL SI, DI
|
|
|
|
// emitLiteral
|
|
LEAL -1(DI), SI
|
|
CMPL SI, $0x1d
|
|
JB one_byte_repeat_emit_encodeBetterBlockAsm1K
|
|
SUBL $0x1d, SI
|
|
CMPL SI, $0x00000100
|
|
JB two_bytes_repeat_emit_encodeBetterBlockAsm1K
|
|
JB three_bytes_repeat_emit_encodeBetterBlockAsm1K
|
|
|
|
three_bytes_repeat_emit_encodeBetterBlockAsm1K:
|
|
MOVB $0xf0, (CX)
|
|
MOVW SI, 1(CX)
|
|
ADDQ $0x03, CX
|
|
ADDL $0x1d, SI
|
|
JMP memmove_long_repeat_emit_encodeBetterBlockAsm1K
|
|
|
|
two_bytes_repeat_emit_encodeBetterBlockAsm1K:
|
|
MOVB $0xe8, (CX)
|
|
MOVB SI, 1(CX)
|
|
ADDL $0x1d, SI
|
|
ADDQ $0x02, CX
|
|
CMPL SI, $0x40
|
|
JB memmove_midrepeat_emit_encodeBetterBlockAsm1K
|
|
JMP memmove_long_repeat_emit_encodeBetterBlockAsm1K
|
|
|
|
one_byte_repeat_emit_encodeBetterBlockAsm1K:
|
|
SHLB $0x03, SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x01, CX
|
|
LEAQ (CX)(DI*1), SI
|
|
|
|
// genMemMoveShort
|
|
// margin: 8, min move: 1
|
|
CMPQ DI, $0x08
|
|
JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm1K_memmove_move_8
|
|
CMPQ DI, $0x10
|
|
JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm1K_memmove_move_8through16
|
|
CMPQ DI, $0x20
|
|
JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm1K_memmove_move_17through32
|
|
JMP emit_lit_memmove_repeat_emit_encodeBetterBlockAsm1K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_repeat_emit_encodeBetterBlockAsm1K_memmove_move_8:
|
|
MOVQ (R8), R9
|
|
MOVQ R9, (CX)
|
|
JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm1K
|
|
|
|
emit_lit_memmove_repeat_emit_encodeBetterBlockAsm1K_memmove_move_8through16:
|
|
MOVQ (R8), R9
|
|
MOVQ -8(R8)(DI*1), R8
|
|
MOVQ R9, (CX)
|
|
MOVQ R8, -8(CX)(DI*1)
|
|
JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm1K
|
|
|
|
emit_lit_memmove_repeat_emit_encodeBetterBlockAsm1K_memmove_move_17through32:
|
|
MOVOU (R8), X0
|
|
MOVOU -16(R8)(DI*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(DI*1)
|
|
JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm1K
|
|
|
|
emit_lit_memmove_repeat_emit_encodeBetterBlockAsm1K_memmove_move_33through64:
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(DI*1), X2
|
|
MOVOU -16(R8)(DI*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(DI*1)
|
|
MOVOU X3, -16(CX)(DI*1)
|
|
|
|
memmove_end_copy_repeat_emit_encodeBetterBlockAsm1K:
|
|
MOVQ SI, CX
|
|
JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm1K
|
|
|
|
memmove_midrepeat_emit_encodeBetterBlockAsm1K:
|
|
LEAQ (CX)(DI*1), SI
|
|
|
|
// genMemMoveShort
|
|
// margin: 8, min move: 30
|
|
CMPQ DI, $0x20
|
|
JBE emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm1K_memmove_move_17through32
|
|
JMP emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm1K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm1K_memmove_move_17through32:
|
|
MOVOU (R8), X0
|
|
MOVOU -16(R8)(DI*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(DI*1)
|
|
JMP memmove_mid_end_copy_repeat_emit_encodeBetterBlockAsm1K
|
|
|
|
emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm1K_memmove_move_33through64:
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(DI*1), X2
|
|
MOVOU -16(R8)(DI*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(DI*1)
|
|
MOVOU X3, -16(CX)(DI*1)
|
|
|
|
memmove_mid_end_copy_repeat_emit_encodeBetterBlockAsm1K:
|
|
MOVQ SI, CX
|
|
JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm1K
|
|
|
|
memmove_long_repeat_emit_encodeBetterBlockAsm1K:
|
|
LEAQ (CX)(DI*1), SI
|
|
|
|
// genMemMoveLong
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(DI*1), X2
|
|
MOVOU -16(R8)(DI*1), X3
|
|
MOVQ DI, R10
|
|
SHRQ $0x05, R10
|
|
MOVQ CX, R9
|
|
ANDL $0x0000001f, R9
|
|
MOVQ $0x00000040, R11
|
|
SUBQ R9, R11
|
|
DECQ R10
|
|
JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm1Klarge_forward_sse_loop_32
|
|
LEAQ -32(R8)(R11*1), R9
|
|
LEAQ -32(CX)(R11*1), R12
|
|
|
|
emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm1Klarge_big_loop_back:
|
|
MOVOU (R9), X4
|
|
MOVOU 16(R9), X5
|
|
MOVOA X4, (R12)
|
|
MOVOA X5, 16(R12)
|
|
ADDQ $0x20, R12
|
|
ADDQ $0x20, R9
|
|
ADDQ $0x20, R11
|
|
DECQ R10
|
|
JNA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm1Klarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm1Klarge_forward_sse_loop_32:
|
|
MOVOU -32(R8)(R11*1), X4
|
|
MOVOU -16(R8)(R11*1), X5
|
|
MOVOA X4, -32(CX)(R11*1)
|
|
MOVOA X5, -16(CX)(R11*1)
|
|
ADDQ $0x20, R11
|
|
CMPQ DI, R11
|
|
JAE emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm1Klarge_forward_sse_loop_32
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(DI*1)
|
|
MOVOU X3, -16(CX)(DI*1)
|
|
MOVQ SI, CX
|
|
|
|
emit_literal_done_repeat_emit_encodeBetterBlockAsm1K:
|
|
ADDL $0x05, AX
|
|
MOVL AX, SI
|
|
SUBL 16(SP), SI
|
|
MOVQ src_len+32(FP), DI
|
|
SUBL AX, DI
|
|
LEAQ (DX)(AX*1), R8
|
|
LEAQ (DX)(SI*1), SI
|
|
|
|
// matchLen
|
|
XORL R10, R10
|
|
JMP matchlen_loop_16_entry_repeat_extend_encodeBetterBlockAsm1K
|
|
|
|
matchlen_loopback_16_repeat_extend_encodeBetterBlockAsm1K:
|
|
MOVQ (R8)(R10*1), R9
|
|
MOVQ 8(R8)(R10*1), R11
|
|
XORQ (SI)(R10*1), R9
|
|
JNZ matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm1K
|
|
XORQ 8(SI)(R10*1), R11
|
|
JNZ matchlen_bsf_16repeat_extend_encodeBetterBlockAsm1K
|
|
LEAL -16(DI), DI
|
|
LEAL 16(R10), R10
|
|
|
|
matchlen_loop_16_entry_repeat_extend_encodeBetterBlockAsm1K:
|
|
CMPL DI, $0x10
|
|
JAE matchlen_loopback_16_repeat_extend_encodeBetterBlockAsm1K
|
|
JMP matchlen_match8_repeat_extend_encodeBetterBlockAsm1K
|
|
|
|
matchlen_bsf_16repeat_extend_encodeBetterBlockAsm1K:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R11, R11
|
|
|
|
#else
|
|
BSFQ R11, R11
|
|
|
|
#endif
|
|
SARQ $0x03, R11
|
|
LEAL 8(R10)(R11*1), R10
|
|
JMP repeat_extend_forward_end_encodeBetterBlockAsm1K
|
|
|
|
matchlen_match8_repeat_extend_encodeBetterBlockAsm1K:
|
|
CMPL DI, $0x08
|
|
JB matchlen_match4_repeat_extend_encodeBetterBlockAsm1K
|
|
MOVQ (R8)(R10*1), R9
|
|
XORQ (SI)(R10*1), R9
|
|
JNZ matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm1K
|
|
LEAL -8(DI), DI
|
|
LEAL 8(R10), R10
|
|
JMP matchlen_match4_repeat_extend_encodeBetterBlockAsm1K
|
|
|
|
matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm1K:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R9, R9
|
|
|
|
#else
|
|
BSFQ R9, R9
|
|
|
|
#endif
|
|
SARQ $0x03, R9
|
|
LEAL (R10)(R9*1), R10
|
|
JMP repeat_extend_forward_end_encodeBetterBlockAsm1K
|
|
|
|
matchlen_match4_repeat_extend_encodeBetterBlockAsm1K:
|
|
CMPL DI, $0x04
|
|
JB matchlen_match2_repeat_extend_encodeBetterBlockAsm1K
|
|
MOVL (R8)(R10*1), R9
|
|
CMPL (SI)(R10*1), R9
|
|
JNE matchlen_match2_repeat_extend_encodeBetterBlockAsm1K
|
|
LEAL -4(DI), DI
|
|
LEAL 4(R10), R10
|
|
|
|
matchlen_match2_repeat_extend_encodeBetterBlockAsm1K:
|
|
CMPL DI, $0x01
|
|
JE matchlen_match1_repeat_extend_encodeBetterBlockAsm1K
|
|
JB repeat_extend_forward_end_encodeBetterBlockAsm1K
|
|
MOVW (R8)(R10*1), R9
|
|
CMPW (SI)(R10*1), R9
|
|
JNE matchlen_match1_repeat_extend_encodeBetterBlockAsm1K
|
|
LEAL 2(R10), R10
|
|
SUBL $0x02, DI
|
|
JZ repeat_extend_forward_end_encodeBetterBlockAsm1K
|
|
|
|
matchlen_match1_repeat_extend_encodeBetterBlockAsm1K:
|
|
MOVB (R8)(R10*1), R9
|
|
CMPB (SI)(R10*1), R9
|
|
JNE repeat_extend_forward_end_encodeBetterBlockAsm1K
|
|
LEAL 1(R10), R10
|
|
|
|
repeat_extend_forward_end_encodeBetterBlockAsm1K:
|
|
ADDL R10, AX
|
|
MOVL AX, SI
|
|
SUBL BX, SI
|
|
MOVL 16(SP), BX
|
|
|
|
// emitRepeat
|
|
LEAL -1(SI), BX
|
|
CMPL SI, $0x1d
|
|
JBE repeat_one_match_repeat_encodeBetterBlockAsm1K
|
|
LEAL -30(SI), BX
|
|
CMPL SI, $0x0000011e
|
|
JB repeat_two_match_repeat_encodeBetterBlockAsm1K
|
|
CMPL SI, $0x0001001e
|
|
JB repeat_three_match_repeat_encodeBetterBlockAsm1K
|
|
MOVB $0xfc, (CX)
|
|
MOVL BX, 1(CX)
|
|
ADDQ $0x04, CX
|
|
JMP repeat_end_emit_encodeBetterBlockAsm1K
|
|
|
|
repeat_three_match_repeat_encodeBetterBlockAsm1K:
|
|
MOVB $0xf4, (CX)
|
|
MOVW BX, 1(CX)
|
|
ADDQ $0x03, CX
|
|
JMP repeat_end_emit_encodeBetterBlockAsm1K
|
|
|
|
repeat_two_match_repeat_encodeBetterBlockAsm1K:
|
|
MOVB $0xec, (CX)
|
|
MOVB BL, 1(CX)
|
|
ADDQ $0x02, CX
|
|
JMP repeat_end_emit_encodeBetterBlockAsm1K
|
|
|
|
repeat_one_match_repeat_encodeBetterBlockAsm1K:
|
|
XORL BX, BX
|
|
LEAL -4(BX)(SI*8), BX
|
|
MOVB BL, (CX)
|
|
ADDQ $0x01, CX
|
|
|
|
repeat_end_emit_encodeBetterBlockAsm1K:
|
|
MOVL AX, 12(SP)
|
|
JMP search_loop_encodeBetterBlockAsm1K
|
|
|
|
no_repeat_found_encodeBetterBlockAsm1K:
|
|
CMPL R10, DI
|
|
JEQ candidate_match_encodeBetterBlockAsm1K
|
|
CMPL R11, DI
|
|
JEQ candidateS_match_encodeBetterBlockAsm1K
|
|
MOVL 20(SP), AX
|
|
JMP search_loop_encodeBetterBlockAsm1K
|
|
|
|
candidateS_match_encodeBetterBlockAsm1K:
|
|
SHRQ $0x08, DI
|
|
MOVQ DI, R10
|
|
SHLQ $0x10, R10
|
|
IMULQ R9, R10
|
|
SHRQ $0x35, R10
|
|
MOVWLZX (BX)(R10*2), SI
|
|
INCL AX
|
|
MOVW AX, (BX)(R10*2)
|
|
CMPL (DX)(SI*1), DI
|
|
JEQ candidate_match_encodeBetterBlockAsm1K
|
|
DECL AX
|
|
MOVL R8, SI
|
|
|
|
candidate_match_encodeBetterBlockAsm1K:
|
|
MOVL 12(SP), BX
|
|
TESTL SI, SI
|
|
JZ match_extend_back_end_encodeBetterBlockAsm1K
|
|
|
|
match_extend_back_loop_encodeBetterBlockAsm1K:
|
|
CMPL AX, BX
|
|
JBE match_extend_back_end_encodeBetterBlockAsm1K
|
|
MOVB -1(DX)(SI*1), DI
|
|
MOVB -1(DX)(AX*1), R8
|
|
CMPB DI, R8
|
|
JNE match_extend_back_end_encodeBetterBlockAsm1K
|
|
LEAL -1(AX), AX
|
|
DECL SI
|
|
JZ match_extend_back_end_encodeBetterBlockAsm1K
|
|
JMP match_extend_back_loop_encodeBetterBlockAsm1K
|
|
|
|
match_extend_back_end_encodeBetterBlockAsm1K:
|
|
MOVL AX, BX
|
|
SUBL 12(SP), BX
|
|
LEAQ 3(CX)(BX*1), BX
|
|
CMPQ BX, (SP)
|
|
JB match_dst_size_check_encodeBetterBlockAsm1K
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
match_dst_size_check_encodeBetterBlockAsm1K:
|
|
MOVL AX, BX
|
|
ADDL $0x04, AX
|
|
ADDL $0x04, SI
|
|
MOVQ src_len+32(FP), DI
|
|
SUBL AX, DI
|
|
LEAQ (DX)(AX*1), R8
|
|
LEAQ (DX)(SI*1), R9
|
|
|
|
// matchLen
|
|
XORL R11, R11
|
|
JMP matchlen_loop_16_entry_match_nolit_encodeBetterBlockAsm1K
|
|
|
|
matchlen_loopback_16_match_nolit_encodeBetterBlockAsm1K:
|
|
MOVQ (R8)(R11*1), R10
|
|
MOVQ 8(R8)(R11*1), R12
|
|
XORQ (R9)(R11*1), R10
|
|
JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm1K
|
|
XORQ 8(R9)(R11*1), R12
|
|
JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm1K
|
|
LEAL -16(DI), DI
|
|
LEAL 16(R11), R11
|
|
|
|
matchlen_loop_16_entry_match_nolit_encodeBetterBlockAsm1K:
|
|
CMPL DI, $0x10
|
|
JAE matchlen_loopback_16_match_nolit_encodeBetterBlockAsm1K
|
|
JMP matchlen_match8_match_nolit_encodeBetterBlockAsm1K
|
|
|
|
matchlen_bsf_16match_nolit_encodeBetterBlockAsm1K:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R12, R12
|
|
|
|
#else
|
|
BSFQ R12, R12
|
|
|
|
#endif
|
|
SARQ $0x03, R12
|
|
LEAL 8(R11)(R12*1), R11
|
|
JMP match_nolit_end_encodeBetterBlockAsm1K
|
|
|
|
matchlen_match8_match_nolit_encodeBetterBlockAsm1K:
|
|
CMPL DI, $0x08
|
|
JB matchlen_match4_match_nolit_encodeBetterBlockAsm1K
|
|
MOVQ (R8)(R11*1), R10
|
|
XORQ (R9)(R11*1), R10
|
|
JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm1K
|
|
LEAL -8(DI), DI
|
|
LEAL 8(R11), R11
|
|
JMP matchlen_match4_match_nolit_encodeBetterBlockAsm1K
|
|
|
|
matchlen_bsf_8_match_nolit_encodeBetterBlockAsm1K:
|
|
#ifdef GOAMD64_v3
|
|
TZCNTQ R10, R10
|
|
|
|
#else
|
|
BSFQ R10, R10
|
|
|
|
#endif
|
|
SARQ $0x03, R10
|
|
LEAL (R11)(R10*1), R11
|
|
JMP match_nolit_end_encodeBetterBlockAsm1K
|
|
|
|
matchlen_match4_match_nolit_encodeBetterBlockAsm1K:
|
|
CMPL DI, $0x04
|
|
JB matchlen_match2_match_nolit_encodeBetterBlockAsm1K
|
|
MOVL (R8)(R11*1), R10
|
|
CMPL (R9)(R11*1), R10
|
|
JNE matchlen_match2_match_nolit_encodeBetterBlockAsm1K
|
|
LEAL -4(DI), DI
|
|
LEAL 4(R11), R11
|
|
|
|
matchlen_match2_match_nolit_encodeBetterBlockAsm1K:
|
|
CMPL DI, $0x01
|
|
JE matchlen_match1_match_nolit_encodeBetterBlockAsm1K
|
|
JB match_nolit_end_encodeBetterBlockAsm1K
|
|
MOVW (R8)(R11*1), R10
|
|
CMPW (R9)(R11*1), R10
|
|
JNE matchlen_match1_match_nolit_encodeBetterBlockAsm1K
|
|
LEAL 2(R11), R11
|
|
SUBL $0x02, DI
|
|
JZ match_nolit_end_encodeBetterBlockAsm1K
|
|
|
|
matchlen_match1_match_nolit_encodeBetterBlockAsm1K:
|
|
MOVB (R8)(R11*1), R10
|
|
CMPB (R9)(R11*1), R10
|
|
JNE match_nolit_end_encodeBetterBlockAsm1K
|
|
LEAL 1(R11), R11
|
|
|
|
match_nolit_end_encodeBetterBlockAsm1K:
|
|
MOVL AX, DI
|
|
SUBL SI, DI
|
|
MOVL DI, 16(SP)
|
|
|
|
// Check if we can combine lit+copy
|
|
MOVLQZX 12(SP), R8
|
|
MOVL BX, SI
|
|
SUBL R8, SI
|
|
JZ match_emit_nolits_encodeBetterBlockAsm1K
|
|
CMPL DI, $0x00000040
|
|
JL match_emit_lits_encodeBetterBlockAsm1K
|
|
CMPL SI, $0x04
|
|
JA match_emit_lits_encodeBetterBlockAsm1K
|
|
MOVL (DX)(R8*1), R8
|
|
ADDL R11, AX
|
|
ADDL $0x04, R11
|
|
MOVL AX, 12(SP)
|
|
|
|
// emitCopy2WithLits
|
|
XORQ R9, R9
|
|
SUBL $0x40, DI
|
|
LEAL -11(R11), R10
|
|
LEAL -4(R11), R11
|
|
MOVW DI, 1(CX)
|
|
CMPL R11, $0x07
|
|
CMOVLGE R10, R9
|
|
MOVQ $0x00000007, DI
|
|
CMOVLLT R11, DI
|
|
LEAL -1(SI)(DI*4), DI
|
|
MOVL $0x00000003, R10
|
|
LEAL (R10)(DI*8), DI
|
|
MOVB DI, (CX)
|
|
ADDQ $0x03, CX
|
|
MOVL R8, (CX)
|
|
ADDQ SI, CX
|
|
TESTL R9, R9
|
|
JZ match_nolit_emitcopy_end_encodeBetterBlockAsm1K
|
|
|
|
// emitRepeat
|
|
LEAL -1(R9), SI
|
|
CMPL R9, $0x1d
|
|
JBE repeat_one_match_emit_repeat_copy2_encodeBetterBlockAsm1K
|
|
LEAL -30(R9), SI
|
|
CMPL R9, $0x0000011e
|
|
JB repeat_two_match_emit_repeat_copy2_encodeBetterBlockAsm1K
|
|
CMPL R9, $0x0001001e
|
|
JB repeat_three_match_emit_repeat_copy2_encodeBetterBlockAsm1K
|
|
MOVB $0xfc, (CX)
|
|
MOVL SI, 1(CX)
|
|
ADDQ $0x04, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm1K
|
|
|
|
repeat_three_match_emit_repeat_copy2_encodeBetterBlockAsm1K:
|
|
MOVB $0xf4, (CX)
|
|
MOVW SI, 1(CX)
|
|
ADDQ $0x03, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm1K
|
|
|
|
repeat_two_match_emit_repeat_copy2_encodeBetterBlockAsm1K:
|
|
MOVB $0xec, (CX)
|
|
MOVB SI, 1(CX)
|
|
ADDQ $0x02, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm1K
|
|
|
|
repeat_one_match_emit_repeat_copy2_encodeBetterBlockAsm1K:
|
|
XORL SI, SI
|
|
LEAL -4(SI)(R9*8), SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x01, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm1K
|
|
|
|
match_emit_lits_encodeBetterBlockAsm1K:
|
|
LEAQ (DX)(R8*1), R8
|
|
|
|
// emitLiteral
|
|
LEAL -1(SI), R9
|
|
CMPL R9, $0x1d
|
|
JB one_byte_match_emit_encodeBetterBlockAsm1K
|
|
SUBL $0x1d, R9
|
|
CMPL R9, $0x00000100
|
|
JB two_bytes_match_emit_encodeBetterBlockAsm1K
|
|
JB three_bytes_match_emit_encodeBetterBlockAsm1K
|
|
|
|
three_bytes_match_emit_encodeBetterBlockAsm1K:
|
|
MOVB $0xf0, (CX)
|
|
MOVW R9, 1(CX)
|
|
ADDQ $0x03, CX
|
|
ADDL $0x1d, R9
|
|
JMP memmove_long_match_emit_encodeBetterBlockAsm1K
|
|
|
|
two_bytes_match_emit_encodeBetterBlockAsm1K:
|
|
MOVB $0xe8, (CX)
|
|
MOVB R9, 1(CX)
|
|
ADDL $0x1d, R9
|
|
ADDQ $0x02, CX
|
|
CMPL R9, $0x40
|
|
JB memmove_midmatch_emit_encodeBetterBlockAsm1K
|
|
JMP memmove_long_match_emit_encodeBetterBlockAsm1K
|
|
|
|
one_byte_match_emit_encodeBetterBlockAsm1K:
|
|
SHLB $0x03, R9
|
|
MOVB R9, (CX)
|
|
ADDQ $0x01, CX
|
|
LEAQ (CX)(SI*1), R9
|
|
|
|
// genMemMoveShort
|
|
// margin: 8, min move: 1
|
|
CMPQ SI, $0x08
|
|
JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm1K_memmove_move_8
|
|
CMPQ SI, $0x10
|
|
JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm1K_memmove_move_8through16
|
|
CMPQ SI, $0x20
|
|
JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm1K_memmove_move_17through32
|
|
JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm1K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_match_emit_encodeBetterBlockAsm1K_memmove_move_8:
|
|
MOVQ (R8), R10
|
|
MOVQ R10, (CX)
|
|
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm1K
|
|
|
|
emit_lit_memmove_match_emit_encodeBetterBlockAsm1K_memmove_move_8through16:
|
|
MOVQ (R8), R10
|
|
MOVQ -8(R8)(SI*1), R8
|
|
MOVQ R10, (CX)
|
|
MOVQ R8, -8(CX)(SI*1)
|
|
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm1K
|
|
|
|
emit_lit_memmove_match_emit_encodeBetterBlockAsm1K_memmove_move_17through32:
|
|
MOVOU (R8), X0
|
|
MOVOU -16(R8)(SI*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(SI*1)
|
|
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm1K
|
|
|
|
emit_lit_memmove_match_emit_encodeBetterBlockAsm1K_memmove_move_33through64:
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(SI*1), X2
|
|
MOVOU -16(R8)(SI*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(SI*1)
|
|
MOVOU X3, -16(CX)(SI*1)
|
|
|
|
memmove_end_copy_match_emit_encodeBetterBlockAsm1K:
|
|
MOVQ R9, CX
|
|
JMP match_emit_nolits_encodeBetterBlockAsm1K
|
|
|
|
memmove_midmatch_emit_encodeBetterBlockAsm1K:
|
|
LEAQ (CX)(SI*1), R9
|
|
|
|
// genMemMoveShort
|
|
// margin: 8, min move: 30
|
|
CMPQ SI, $0x20
|
|
JBE emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm1K_memmove_move_17through32
|
|
JMP emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm1K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm1K_memmove_move_17through32:
|
|
MOVOU (R8), X0
|
|
MOVOU -16(R8)(SI*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(SI*1)
|
|
JMP memmove_mid_end_copy_match_emit_encodeBetterBlockAsm1K
|
|
|
|
emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm1K_memmove_move_33through64:
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(SI*1), X2
|
|
MOVOU -16(R8)(SI*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(SI*1)
|
|
MOVOU X3, -16(CX)(SI*1)
|
|
|
|
memmove_mid_end_copy_match_emit_encodeBetterBlockAsm1K:
|
|
MOVQ R9, CX
|
|
JMP match_emit_nolits_encodeBetterBlockAsm1K
|
|
|
|
memmove_long_match_emit_encodeBetterBlockAsm1K:
|
|
LEAQ (CX)(SI*1), R9
|
|
|
|
// genMemMoveLong
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(SI*1), X2
|
|
MOVOU -16(R8)(SI*1), X3
|
|
MOVQ SI, R12
|
|
SHRQ $0x05, R12
|
|
MOVQ CX, R10
|
|
ANDL $0x0000001f, R10
|
|
MOVQ $0x00000040, R13
|
|
SUBQ R10, R13
|
|
DECQ R12
|
|
JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm1Klarge_forward_sse_loop_32
|
|
LEAQ -32(R8)(R13*1), R10
|
|
LEAQ -32(CX)(R13*1), R14
|
|
|
|
emit_lit_memmove_long_match_emit_encodeBetterBlockAsm1Klarge_big_loop_back:
|
|
MOVOU (R10), X4
|
|
MOVOU 16(R10), X5
|
|
MOVOA X4, (R14)
|
|
MOVOA X5, 16(R14)
|
|
ADDQ $0x20, R14
|
|
ADDQ $0x20, R10
|
|
ADDQ $0x20, R13
|
|
DECQ R12
|
|
JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm1Klarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_match_emit_encodeBetterBlockAsm1Klarge_forward_sse_loop_32:
|
|
MOVOU -32(R8)(R13*1), X4
|
|
MOVOU -16(R8)(R13*1), X5
|
|
MOVOA X4, -32(CX)(R13*1)
|
|
MOVOA X5, -16(CX)(R13*1)
|
|
ADDQ $0x20, R13
|
|
CMPQ SI, R13
|
|
JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm1Klarge_forward_sse_loop_32
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(SI*1)
|
|
MOVOU X3, -16(CX)(SI*1)
|
|
MOVQ R9, CX
|
|
|
|
match_emit_nolits_encodeBetterBlockAsm1K:
|
|
ADDL R11, AX
|
|
ADDL $0x04, R11
|
|
MOVL AX, 12(SP)
|
|
|
|
// emitCopy
|
|
CMPL DI, $0x00000400
|
|
JA two_byte_match_nolit_encodeBetterBlockAsm1K
|
|
CMPL R11, $0x00000013
|
|
JAE emit_one_longer_match_nolit_encodeBetterBlockAsm1K
|
|
LEAL -1(DI), SI
|
|
SHLL $0x06, SI
|
|
LEAL -15(SI)(R11*4), SI
|
|
MOVW SI, (CX)
|
|
ADDQ $0x02, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm1K
|
|
|
|
emit_one_longer_match_nolit_encodeBetterBlockAsm1K:
|
|
CMPL R11, $0x00000112
|
|
JAE emit_copy1_repeat_match_nolit_encodeBetterBlockAsm1K
|
|
LEAL -1(DI), SI
|
|
SHLL $0x06, SI
|
|
LEAL 61(SI), SI
|
|
MOVW SI, (CX)
|
|
LEAL -18(R11), SI
|
|
MOVB SI, 2(CX)
|
|
ADDQ $0x03, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm1K
|
|
|
|
emit_copy1_repeat_match_nolit_encodeBetterBlockAsm1K:
|
|
LEAL -1(DI), SI
|
|
SHLL $0x06, SI
|
|
LEAL 57(SI), SI
|
|
MOVW SI, (CX)
|
|
ADDQ $0x02, CX
|
|
SUBL $0x12, R11
|
|
|
|
// emitRepeat
|
|
LEAL -1(R11), SI
|
|
CMPL R11, $0x1d
|
|
JBE repeat_one_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm1K
|
|
LEAL -30(R11), SI
|
|
CMPL R11, $0x0000011e
|
|
JB repeat_two_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm1K
|
|
CMPL R11, $0x0001001e
|
|
JB repeat_three_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm1K
|
|
MOVB $0xfc, (CX)
|
|
MOVL SI, 1(CX)
|
|
ADDQ $0x04, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm1K
|
|
|
|
repeat_three_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm1K:
|
|
MOVB $0xf4, (CX)
|
|
MOVW SI, 1(CX)
|
|
ADDQ $0x03, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm1K
|
|
|
|
repeat_two_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm1K:
|
|
MOVB $0xec, (CX)
|
|
MOVB SI, 1(CX)
|
|
ADDQ $0x02, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm1K
|
|
|
|
repeat_one_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm1K:
|
|
XORL SI, SI
|
|
LEAL -4(SI)(R11*8), SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x01, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm1K
|
|
|
|
two_byte_match_nolit_encodeBetterBlockAsm1K:
|
|
// emitCopy2
|
|
LEAL -64(DI), DI
|
|
LEAL -4(R11), R11
|
|
MOVW DI, 1(CX)
|
|
CMPL R11, $0x3c
|
|
JBE emit_copy2_0_match_nolit_encodeBetterBlockAsm1K_emit2
|
|
LEAL -60(R11), SI
|
|
CMPL R11, $0x0000013c
|
|
JB emit_copy2_1_match_nolit_encodeBetterBlockAsm1K_emit2
|
|
CMPL R11, $0x0001003c
|
|
JB emit_copy2_2_match_nolit_encodeBetterBlockAsm1K_emit2
|
|
MOVB $0xfe, (CX)
|
|
MOVL SI, 3(CX)
|
|
ADDQ $0x06, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm1K
|
|
|
|
emit_copy2_2_match_nolit_encodeBetterBlockAsm1K_emit2:
|
|
MOVB $0xfa, (CX)
|
|
MOVW SI, 3(CX)
|
|
ADDQ $0x05, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm1K
|
|
|
|
emit_copy2_1_match_nolit_encodeBetterBlockAsm1K_emit2:
|
|
MOVB $0xf6, (CX)
|
|
MOVB SI, 3(CX)
|
|
ADDQ $0x04, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm1K
|
|
|
|
emit_copy2_0_match_nolit_encodeBetterBlockAsm1K_emit2:
|
|
MOVL $0x00000002, SI
|
|
LEAL (SI)(R11*4), SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x03, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm1K
|
|
|
|
// emitLiteralsDstP
|
|
MOVL 12(SP), SI
|
|
CMPL SI, BX
|
|
JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm1K
|
|
MOVL BX, DI
|
|
MOVL BX, 12(SP)
|
|
LEAQ (DX)(SI*1), R8
|
|
SUBL SI, DI
|
|
|
|
// emitLiteral
|
|
LEAL -1(DI), SI
|
|
CMPL SI, $0x1d
|
|
JB one_byte_match_emit_repeat_encodeBetterBlockAsm1K
|
|
SUBL $0x1d, SI
|
|
CMPL SI, $0x00000100
|
|
JB two_bytes_match_emit_repeat_encodeBetterBlockAsm1K
|
|
JB three_bytes_match_emit_repeat_encodeBetterBlockAsm1K
|
|
|
|
three_bytes_match_emit_repeat_encodeBetterBlockAsm1K:
|
|
MOVB $0xf0, (CX)
|
|
MOVW SI, 1(CX)
|
|
ADDQ $0x03, CX
|
|
ADDL $0x1d, SI
|
|
JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm1K
|
|
|
|
two_bytes_match_emit_repeat_encodeBetterBlockAsm1K:
|
|
MOVB $0xe8, (CX)
|
|
MOVB SI, 1(CX)
|
|
ADDL $0x1d, SI
|
|
ADDQ $0x02, CX
|
|
CMPL SI, $0x40
|
|
JB memmove_midmatch_emit_repeat_encodeBetterBlockAsm1K
|
|
JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm1K
|
|
|
|
one_byte_match_emit_repeat_encodeBetterBlockAsm1K:
|
|
SHLB $0x03, SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x01, CX
|
|
LEAQ (CX)(DI*1), SI
|
|
|
|
// genMemMoveShort
|
|
// margin: 8, min move: 1
|
|
CMPQ DI, $0x08
|
|
JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm1K_memmove_move_8
|
|
CMPQ DI, $0x10
|
|
JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm1K_memmove_move_8through16
|
|
CMPQ DI, $0x20
|
|
JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm1K_memmove_move_17through32
|
|
JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm1K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm1K_memmove_move_8:
|
|
MOVQ (R8), R9
|
|
MOVQ R9, (CX)
|
|
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm1K
|
|
|
|
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm1K_memmove_move_8through16:
|
|
MOVQ (R8), R9
|
|
MOVQ -8(R8)(DI*1), R8
|
|
MOVQ R9, (CX)
|
|
MOVQ R8, -8(CX)(DI*1)
|
|
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm1K
|
|
|
|
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm1K_memmove_move_17through32:
|
|
MOVOU (R8), X0
|
|
MOVOU -16(R8)(DI*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(DI*1)
|
|
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm1K
|
|
|
|
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm1K_memmove_move_33through64:
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(DI*1), X2
|
|
MOVOU -16(R8)(DI*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(DI*1)
|
|
MOVOU X3, -16(CX)(DI*1)
|
|
|
|
memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm1K:
|
|
MOVQ SI, CX
|
|
JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm1K
|
|
|
|
memmove_midmatch_emit_repeat_encodeBetterBlockAsm1K:
|
|
LEAQ (CX)(DI*1), SI
|
|
|
|
// genMemMoveShort
|
|
// margin: 8, min move: 30
|
|
CMPQ DI, $0x20
|
|
JBE emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm1K_memmove_move_17through32
|
|
JMP emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm1K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm1K_memmove_move_17through32:
|
|
MOVOU (R8), X0
|
|
MOVOU -16(R8)(DI*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(DI*1)
|
|
JMP memmove_mid_end_copy_match_emit_repeat_encodeBetterBlockAsm1K
|
|
|
|
emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm1K_memmove_move_33through64:
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(DI*1), X2
|
|
MOVOU -16(R8)(DI*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(DI*1)
|
|
MOVOU X3, -16(CX)(DI*1)
|
|
|
|
memmove_mid_end_copy_match_emit_repeat_encodeBetterBlockAsm1K:
|
|
MOVQ SI, CX
|
|
JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm1K
|
|
|
|
memmove_long_match_emit_repeat_encodeBetterBlockAsm1K:
|
|
LEAQ (CX)(DI*1), SI
|
|
|
|
// genMemMoveLong
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(DI*1), X2
|
|
MOVOU -16(R8)(DI*1), X3
|
|
MOVQ DI, R10
|
|
SHRQ $0x05, R10
|
|
MOVQ CX, R9
|
|
ANDL $0x0000001f, R9
|
|
MOVQ $0x00000040, R12
|
|
SUBQ R9, R12
|
|
DECQ R10
|
|
JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm1Klarge_forward_sse_loop_32
|
|
LEAQ -32(R8)(R12*1), R9
|
|
LEAQ -32(CX)(R12*1), R13
|
|
|
|
emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm1Klarge_big_loop_back:
|
|
MOVOU (R9), X4
|
|
MOVOU 16(R9), X5
|
|
MOVOA X4, (R13)
|
|
MOVOA X5, 16(R13)
|
|
ADDQ $0x20, R13
|
|
ADDQ $0x20, R9
|
|
ADDQ $0x20, R12
|
|
DECQ R10
|
|
JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm1Klarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm1Klarge_forward_sse_loop_32:
|
|
MOVOU -32(R8)(R12*1), X4
|
|
MOVOU -16(R8)(R12*1), X5
|
|
MOVOA X4, -32(CX)(R12*1)
|
|
MOVOA X5, -16(CX)(R12*1)
|
|
ADDQ $0x20, R12
|
|
CMPQ DI, R12
|
|
JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm1Klarge_forward_sse_loop_32
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(DI*1)
|
|
MOVOU X3, -16(CX)(DI*1)
|
|
MOVQ SI, CX
|
|
|
|
emit_literal_done_match_emit_repeat_encodeBetterBlockAsm1K:
|
|
ADDL R11, AX
|
|
ADDL $0x04, R11
|
|
MOVL AX, 12(SP)
|
|
|
|
// emitRepeat
|
|
LEAL -1(R11), SI
|
|
CMPL R11, $0x1d
|
|
JBE repeat_one_match_nolit_repeat_encodeBetterBlockAsm1K
|
|
LEAL -30(R11), SI
|
|
CMPL R11, $0x0000011e
|
|
JB repeat_two_match_nolit_repeat_encodeBetterBlockAsm1K
|
|
CMPL R11, $0x0001001e
|
|
JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm1K
|
|
MOVB $0xfc, (CX)
|
|
MOVL SI, 1(CX)
|
|
ADDQ $0x04, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm1K
|
|
|
|
repeat_three_match_nolit_repeat_encodeBetterBlockAsm1K:
|
|
MOVB $0xf4, (CX)
|
|
MOVW SI, 1(CX)
|
|
ADDQ $0x03, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm1K
|
|
|
|
repeat_two_match_nolit_repeat_encodeBetterBlockAsm1K:
|
|
MOVB $0xec, (CX)
|
|
MOVB SI, 1(CX)
|
|
ADDQ $0x02, CX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm1K
|
|
|
|
repeat_one_match_nolit_repeat_encodeBetterBlockAsm1K:
|
|
XORL SI, SI
|
|
LEAL -4(SI)(R11*8), SI
|
|
MOVB SI, (CX)
|
|
ADDQ $0x01, CX
|
|
|
|
match_nolit_emitcopy_end_encodeBetterBlockAsm1K:
|
|
CMPL AX, 8(SP)
|
|
JAE emit_remainder_encodeBetterBlockAsm1K
|
|
CMPQ CX, (SP)
|
|
JB match_nolit_dst_ok_encodeBetterBlockAsm1K
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
match_nolit_dst_ok_encodeBetterBlockAsm1K:
|
|
MOVQ tmp+48(FP), SI
|
|
MOVQ $0x0000cf1bbcdcbf9b, DI
|
|
MOVQ $0x9e3779b1, R8
|
|
LEAQ 1(BX), BX
|
|
LEAQ -2(AX), R9
|
|
MOVQ (DX)(BX*1), R10
|
|
MOVQ 1(DX)(BX*1), R11
|
|
MOVQ (DX)(R9*1), R12
|
|
MOVQ 1(DX)(R9*1), R13
|
|
SHLQ $0x10, R10
|
|
IMULQ DI, R10
|
|
SHRQ $0x35, R10
|
|
SHLQ $0x20, R11
|
|
IMULQ R8, R11
|
|
SHRQ $0x38, R11
|
|
SHLQ $0x10, R12
|
|
IMULQ DI, R12
|
|
SHRQ $0x35, R12
|
|
SHLQ $0x20, R13
|
|
IMULQ R8, R13
|
|
SHRQ $0x38, R13
|
|
LEAQ 1(BX), R8
|
|
LEAQ 1(R9), R14
|
|
MOVW BX, (SI)(R10*2)
|
|
MOVW R9, (SI)(R12*2)
|
|
LEAQ 1(R9)(BX*1), R10
|
|
SHRQ $0x01, R10
|
|
ADDQ $0x01, BX
|
|
SUBQ $0x01, R9
|
|
MOVW R8, 4096(SI)(R11*2)
|
|
MOVW R14, 4096(SI)(R13*2)
|
|
|
|
index_loop_encodeBetterBlockAsm1K:
|
|
CMPQ R10, R9
|
|
JAE search_loop_encodeBetterBlockAsm1K
|
|
MOVQ (DX)(BX*1), R8
|
|
MOVQ (DX)(R10*1), R11
|
|
SHLQ $0x10, R8
|
|
IMULQ DI, R8
|
|
SHRQ $0x35, R8
|
|
SHLQ $0x10, R11
|
|
IMULQ DI, R11
|
|
SHRQ $0x35, R11
|
|
MOVW BX, (SI)(R8*2)
|
|
MOVW R9, (SI)(R11*2)
|
|
ADDQ $0x02, BX
|
|
ADDQ $0x02, R10
|
|
JMP index_loop_encodeBetterBlockAsm1K
|
|
|
|
emit_remainder_encodeBetterBlockAsm1K:
|
|
MOVQ src_len+32(FP), AX
|
|
SUBL 12(SP), AX
|
|
LEAQ 3(CX)(AX*1), AX
|
|
CMPQ AX, (SP)
|
|
JB emit_remainder_ok_encodeBetterBlockAsm1K
|
|
MOVQ $0x00000000, ret+56(FP)
|
|
RET
|
|
|
|
emit_remainder_ok_encodeBetterBlockAsm1K:
|
|
MOVQ src_len+32(FP), AX
|
|
|
|
// emitLiteralsDstP
|
|
MOVL 12(SP), BX
|
|
CMPL BX, AX
|
|
JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm1K
|
|
MOVL AX, SI
|
|
MOVL AX, 12(SP)
|
|
LEAQ (DX)(BX*1), AX
|
|
SUBL BX, SI
|
|
|
|
// emitLiteral
|
|
LEAL -1(SI), DX
|
|
CMPL DX, $0x1d
|
|
JB one_byte_emit_remainder_encodeBetterBlockAsm1K
|
|
SUBL $0x1d, DX
|
|
CMPL DX, $0x00000100
|
|
JB two_bytes_emit_remainder_encodeBetterBlockAsm1K
|
|
JB three_bytes_emit_remainder_encodeBetterBlockAsm1K
|
|
|
|
three_bytes_emit_remainder_encodeBetterBlockAsm1K:
|
|
MOVB $0xf0, (CX)
|
|
MOVW DX, 1(CX)
|
|
ADDQ $0x03, CX
|
|
ADDL $0x1d, DX
|
|
JMP memmove_long_emit_remainder_encodeBetterBlockAsm1K
|
|
|
|
two_bytes_emit_remainder_encodeBetterBlockAsm1K:
|
|
MOVB $0xe8, (CX)
|
|
MOVB DL, 1(CX)
|
|
ADDL $0x1d, DX
|
|
ADDQ $0x02, CX
|
|
CMPL DX, $0x40
|
|
JB memmove_midemit_remainder_encodeBetterBlockAsm1K
|
|
JMP memmove_long_emit_remainder_encodeBetterBlockAsm1K
|
|
|
|
one_byte_emit_remainder_encodeBetterBlockAsm1K:
|
|
SHLB $0x03, DL
|
|
MOVB DL, (CX)
|
|
ADDQ $0x01, CX
|
|
LEAQ (CX)(SI*1), DX
|
|
MOVL SI, BX
|
|
|
|
// genMemMoveShort
|
|
// margin: -1, min move: 1
|
|
CMPQ BX, $0x03
|
|
JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm1K_memmove_move_1or2
|
|
JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm1K_memmove_move_3
|
|
CMPQ BX, $0x08
|
|
JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm1K_memmove_move_4through8
|
|
CMPQ BX, $0x10
|
|
JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm1K_memmove_move_8through16
|
|
CMPQ BX, $0x20
|
|
JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm1K_memmove_move_17through32
|
|
JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm1K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm1K_memmove_move_1or2:
|
|
MOVB (AX), SI
|
|
MOVB -1(AX)(BX*1), AL
|
|
MOVB SI, (CX)
|
|
MOVB AL, -1(CX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm1K
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm1K_memmove_move_3:
|
|
MOVW (AX), SI
|
|
MOVB 2(AX), AL
|
|
MOVW SI, (CX)
|
|
MOVB AL, 2(CX)
|
|
JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm1K
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm1K_memmove_move_4through8:
|
|
MOVL (AX), SI
|
|
MOVL -4(AX)(BX*1), AX
|
|
MOVL SI, (CX)
|
|
MOVL AX, -4(CX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm1K
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm1K_memmove_move_8through16:
|
|
MOVQ (AX), SI
|
|
MOVQ -8(AX)(BX*1), AX
|
|
MOVQ SI, (CX)
|
|
MOVQ AX, -8(CX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm1K
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm1K_memmove_move_17through32:
|
|
MOVOU (AX), X0
|
|
MOVOU -16(AX)(BX*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm1K
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm1K_memmove_move_33through64:
|
|
MOVOU (AX), X0
|
|
MOVOU 16(AX), X1
|
|
MOVOU -32(AX)(BX*1), X2
|
|
MOVOU -16(AX)(BX*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(BX*1)
|
|
MOVOU X3, -16(CX)(BX*1)
|
|
|
|
memmove_end_copy_emit_remainder_encodeBetterBlockAsm1K:
|
|
MOVQ DX, CX
|
|
JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm1K
|
|
|
|
memmove_midemit_remainder_encodeBetterBlockAsm1K:
|
|
LEAQ (CX)(SI*1), DX
|
|
MOVL SI, BX
|
|
|
|
// genMemMoveShort
|
|
// margin: -2, min move: 30
|
|
CMPQ BX, $0x20
|
|
JBE emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm1K_memmove_move_17through32
|
|
JMP emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm1K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm1K_memmove_move_17through32:
|
|
MOVOU (AX), X0
|
|
MOVOU -16(AX)(BX*1), X1
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, -16(CX)(BX*1)
|
|
JMP memmove_mid_end_copy_emit_remainder_encodeBetterBlockAsm1K
|
|
|
|
emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm1K_memmove_move_33through64:
|
|
MOVOU (AX), X0
|
|
MOVOU 16(AX), X1
|
|
MOVOU -32(AX)(BX*1), X2
|
|
MOVOU -16(AX)(BX*1), X3
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(BX*1)
|
|
MOVOU X3, -16(CX)(BX*1)
|
|
|
|
memmove_mid_end_copy_emit_remainder_encodeBetterBlockAsm1K:
|
|
MOVQ DX, CX
|
|
JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm1K
|
|
|
|
memmove_long_emit_remainder_encodeBetterBlockAsm1K:
|
|
LEAQ (CX)(SI*1), DX
|
|
MOVL SI, BX
|
|
|
|
// genMemMoveLong
|
|
MOVOU (AX), X0
|
|
MOVOU 16(AX), X1
|
|
MOVOU -32(AX)(BX*1), X2
|
|
MOVOU -16(AX)(BX*1), X3
|
|
MOVQ BX, DI
|
|
SHRQ $0x05, DI
|
|
MOVQ CX, SI
|
|
ANDL $0x0000001f, SI
|
|
MOVQ $0x00000040, R8
|
|
SUBQ SI, R8
|
|
DECQ DI
|
|
JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm1Klarge_forward_sse_loop_32
|
|
LEAQ -32(AX)(R8*1), SI
|
|
LEAQ -32(CX)(R8*1), R9
|
|
|
|
emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm1Klarge_big_loop_back:
|
|
MOVOU (SI), X4
|
|
MOVOU 16(SI), X5
|
|
MOVOA X4, (R9)
|
|
MOVOA X5, 16(R9)
|
|
ADDQ $0x20, R9
|
|
ADDQ $0x20, SI
|
|
ADDQ $0x20, R8
|
|
DECQ DI
|
|
JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm1Klarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm1Klarge_forward_sse_loop_32:
|
|
MOVOU -32(AX)(R8*1), X4
|
|
MOVOU -16(AX)(R8*1), X5
|
|
MOVOA X4, -32(CX)(R8*1)
|
|
MOVOA X5, -16(CX)(R8*1)
|
|
ADDQ $0x20, R8
|
|
CMPQ BX, R8
|
|
JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm1Klarge_forward_sse_loop_32
|
|
MOVOU X0, (CX)
|
|
MOVOU X1, 16(CX)
|
|
MOVOU X2, -32(CX)(BX*1)
|
|
MOVOU X3, -16(CX)(BX*1)
|
|
MOVQ DX, CX
|
|
|
|
emit_literal_done_emit_remainder_encodeBetterBlockAsm1K:
|
|
MOVQ dst_base+0(FP), AX
|
|
SUBQ AX, CX
|
|
MOVQ CX, ret+56(FP)
|
|
RET
|
|
|
|
// func emitLiteral(dst []byte, lit []byte) int
|
|
// Requires: SSE2
|
|
// Writes a length header followed by a copy of lit into dst and returns the
// total number of bytes produced (header size + len(lit)); returns 0 when
// lit is empty. No bounds checks are performed against len(dst) here.
//
// Register roles:
//   AX = write cursor in dst      CX = read pointer into lit
//   DX = len(lit)                 BX = return value (bytes written)
//   SI = len(lit)-1, later the value stored in the header
// Header layout chosen from n = len(lit)-1:
//   n < 29              : 1 byte   n<<3
//   n-29 < 0x100        : 2 bytes  0xe8, (n-29)
//   n-29 < 0x10000      : 3 bytes  0xf0, (n-29) as 16 bits
//   otherwise           : 4 bytes  0xf8, (n-29) as 24 bits
TEXT ·emitLiteral(SB), NOSPLIT, $0-56
|
|
MOVQ lit_len+32(FP), DX
|
|
MOVQ dst_base+0(FP), AX
|
|
MOVQ lit_base+24(FP), CX
|
|
TESTQ DX, DX
|
|
// Empty literal: nothing is written and 0 is returned.
JZ emit_literal_end_standalone_skip
|
|
|
|
// emitLiteral
|
|
// BX starts at len(lit); each header path adds its own size on top.
MOVL DX, BX
|
|
LEAL -1(DX), SI
|
|
CMPL SI, $0x1d
|
|
JB one_byte_standalone
|
|
SUBL $0x1d, SI
|
|
CMPL SI, $0x00000100
|
|
JB two_bytes_standalone
|
|
CMPL SI, $0x00010000
|
|
JB three_bytes_standalone
|
|
// Four-byte header: 0xf8, low 16 bits of SI, then bits 16..23 of SI.
MOVL SI, DI
|
|
SHRL $0x10, DI
|
|
MOVB $0xf8, (AX)
|
|
MOVW SI, 1(AX)
|
|
MOVB DI, 3(AX)
|
|
ADDQ $0x04, BX
|
|
ADDQ $0x04, AX
|
|
ADDL $0x1d, SI
|
|
JMP memmove_long_standalone
|
|
|
|
three_bytes_standalone:
|
|
// Three-byte header: 0xf0 followed by SI as a 16-bit value.
MOVB $0xf0, (AX)
|
|
MOVW SI, 1(AX)
|
|
ADDQ $0x03, BX
|
|
ADDQ $0x03, AX
|
|
ADDL $0x1d, SI
|
|
JMP memmove_long_standalone
|
|
|
|
two_bytes_standalone:
|
|
// Two-byte header: 0xe8 followed by the low byte of SI.
MOVB $0xe8, (AX)
|
|
MOVB SI, 1(AX)
|
|
// Restore SI to len(lit)-1 so the size class of the copy can be chosen.
ADDL $0x1d, SI
|
|
ADDQ $0x02, BX
|
|
ADDQ $0x02, AX
|
|
// len(lit)-1 < 64, i.e. at most 64 bytes: use the short SSE copy.
CMPL SI, $0x40
|
|
JB memmove_midstandalone
|
|
JMP memmove_long_standalone
|
|
|
|
one_byte_standalone:
|
|
// One-byte header: (len(lit)-1) << 3.
SHLB $0x03, SI
|
|
MOVB SI, (AX)
|
|
ADDQ $0x01, BX
|
|
ADDQ $0x01, AX
|
|
|
|
// genMemMoveShort
|
|
// margin: 0, min move: 1
|
|
// Dispatch on DX = len(lit). Every case below copies with a head load and a
// tail load that may overlap, so no per-byte loop is needed.
CMPQ DX, $0x03
|
|
JB emit_lit_memmove_standalone_memmove_move_1or2
|
|
JE emit_lit_memmove_standalone_memmove_move_3
|
|
CMPQ DX, $0x08
|
|
JBE emit_lit_memmove_standalone_memmove_move_4through8
|
|
CMPQ DX, $0x10
|
|
JBE emit_lit_memmove_standalone_memmove_move_8through16
|
|
CMPQ DX, $0x20
|
|
JBE emit_lit_memmove_standalone_memmove_move_17through32
|
|
JMP emit_lit_memmove_standalone_memmove_move_33through64
|
|
|
|
emit_lit_memmove_standalone_memmove_move_1or2:
|
|
// First and last byte; for a 1-byte literal both refer to the same byte.
MOVB (CX), SI
|
|
// CL overwrites the low byte of the source pointer; CX is not used as a
// pointer again on this path.
MOVB -1(CX)(DX*1), CL
|
|
MOVB SI, (AX)
|
|
MOVB CL, -1(AX)(DX*1)
|
|
JMP emit_literal_end_standalone
|
|
|
|
emit_lit_memmove_standalone_memmove_move_3:
|
|
MOVW (CX), SI
|
|
MOVB 2(CX), CL
|
|
MOVW SI, (AX)
|
|
MOVB CL, 2(AX)
|
|
JMP emit_literal_end_standalone
|
|
|
|
emit_lit_memmove_standalone_memmove_move_4through8:
|
|
// Two 4-byte moves: first four and last four bytes (overlapping below 8).
MOVL (CX), SI
|
|
MOVL -4(CX)(DX*1), CX
|
|
MOVL SI, (AX)
|
|
MOVL CX, -4(AX)(DX*1)
|
|
JMP emit_literal_end_standalone
|
|
|
|
emit_lit_memmove_standalone_memmove_move_8through16:
|
|
// Two 8-byte moves: first eight and last eight bytes.
MOVQ (CX), SI
|
|
MOVQ -8(CX)(DX*1), CX
|
|
MOVQ SI, (AX)
|
|
MOVQ CX, -8(AX)(DX*1)
|
|
JMP emit_literal_end_standalone
|
|
|
|
emit_lit_memmove_standalone_memmove_move_17through32:
|
|
// Two unaligned 16-byte moves: first sixteen and last sixteen bytes.
MOVOU (CX), X0
|
|
MOVOU -16(CX)(DX*1), X1
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, -16(AX)(DX*1)
|
|
JMP emit_literal_end_standalone
|
|
|
|
emit_lit_memmove_standalone_memmove_move_33through64:
|
|
// Four unaligned 16-byte moves: first 32 and last 32 bytes.
MOVOU (CX), X0
|
|
MOVOU 16(CX), X1
|
|
MOVOU -32(CX)(DX*1), X2
|
|
MOVOU -16(CX)(DX*1), X3
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(DX*1)
|
|
MOVOU X3, -16(AX)(DX*1)
|
|
JMP emit_literal_end_standalone
|
|
// Unreachable duplicate jump (follows an unconditional JMP).
JMP emit_literal_end_standalone
|
|
|
|
memmove_midstandalone:
|
|
// genMemMoveShort
|
|
// margin: 0, min move: 30
|
|
// Reached only from the two-byte header path with len(lit) in 30..64.
CMPQ DX, $0x20
|
|
JBE emit_lit_memmove_mid_standalone_memmove_move_17through32
|
|
JMP emit_lit_memmove_mid_standalone_memmove_move_33through64
|
|
|
|
emit_lit_memmove_mid_standalone_memmove_move_17through32:
|
|
MOVOU (CX), X0
|
|
MOVOU -16(CX)(DX*1), X1
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, -16(AX)(DX*1)
|
|
JMP emit_literal_end_standalone
|
|
|
|
emit_lit_memmove_mid_standalone_memmove_move_33through64:
|
|
MOVOU (CX), X0
|
|
MOVOU 16(CX), X1
|
|
MOVOU -32(CX)(DX*1), X2
|
|
MOVOU -16(CX)(DX*1), X3
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(DX*1)
|
|
MOVOU X3, -16(AX)(DX*1)
|
|
JMP emit_literal_end_standalone
|
|
// Unreachable duplicate jump (follows an unconditional JMP).
JMP emit_literal_end_standalone
|
|
|
|
memmove_long_standalone:
|
|
// genMemMoveLong
|
|
// Save the first and last 32 bytes of the source in X0..X3; they are stored
// after the bulk loop so the unaligned head and tail are always covered.
MOVOU (CX), X0
|
|
MOVOU 16(CX), X1
|
|
MOVOU -32(CX)(DX*1), X2
|
|
MOVOU -16(CX)(DX*1), X3
|
|
// DI = len(lit) / 32.
MOVQ DX, DI
|
|
SHRQ $0x05, DI
|
|
// R8 = 64 - (dst & 31), so dst + R8 - 32 is 32-byte aligned; this is what
// makes the aligned MOVOA stores below legal.
MOVQ AX, SI
|
|
ANDL $0x0000001f, SI
|
|
MOVQ $0x00000040, R8
|
|
SUBQ SI, R8
|
|
// NOTE(review): DECQ leaves CF untouched, so JA/JNA below depend on the CF
// left by the preceding SUBQ/ADDQ together with ZF from DECQ — confirm the
// intended loop bounds against the generator before changing anything here.
DECQ DI
|
|
JA emit_lit_memmove_long_standalonelarge_forward_sse_loop_32
|
|
LEAQ -32(CX)(R8*1), SI
|
|
LEAQ -32(AX)(R8*1), R9
|
|
|
|
emit_lit_memmove_long_standalonelarge_big_loop_back:
|
|
// 32 bytes per iteration: unaligned loads from SI, aligned stores to R9.
MOVOU (SI), X4
|
|
MOVOU 16(SI), X5
|
|
MOVOA X4, (R9)
|
|
MOVOA X5, 16(R9)
|
|
ADDQ $0x20, R9
|
|
ADDQ $0x20, SI
|
|
ADDQ $0x20, R8
|
|
DECQ DI
|
|
JNA emit_lit_memmove_long_standalonelarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_standalonelarge_forward_sse_loop_32:
|
|
// Copy the 32-byte chunk ending at offset R8, then advance R8 by 32 and
// repeat while len(lit) >= R8.
MOVOU -32(CX)(R8*1), X4
|
|
MOVOU -16(CX)(R8*1), X5
|
|
MOVOA X4, -32(AX)(R8*1)
|
|
MOVOA X5, -16(AX)(R8*1)
|
|
ADDQ $0x20, R8
|
|
CMPQ DX, R8
|
|
JAE emit_lit_memmove_long_standalonelarge_forward_sse_loop_32
|
|
// Finally write the saved head and tail with unaligned stores.
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(DX*1)
|
|
MOVOU X3, -16(AX)(DX*1)
|
|
JMP emit_literal_end_standalone
|
|
// Unreachable duplicate jump (follows an unconditional JMP).
JMP emit_literal_end_standalone
|
|
|
|
emit_literal_end_standalone_skip:
|
|
XORQ BX, BX
|
|
|
|
emit_literal_end_standalone:
|
|
MOVQ BX, ret+48(FP)
|
|
RET
|
|
|
|
// func emitRepeat(dst []byte, length int) int
|
|
// Writes a repeat header for `length` at the start of dst and returns the
// number of header bytes (1..4). No bounds checks against len(dst).
//
// Register roles: AX = write cursor, CX = length, DX = bytes written,
// BX = value stored in the header.
// Encoding by length:
//   length <= 29       : 1 byte   length*8 - 4
//   length < 0x11e     : 2 bytes  0xec, (length-30)
//   length < 0x1001e   : 3 bytes  0xf4, (length-30) as 16 bits
//   otherwise          : 4 bytes  0xfc, (length-30) low 24 bits
TEXT ·emitRepeat(SB), NOSPLIT, $0-40
|
|
XORQ DX, DX
|
|
MOVQ dst_base+0(FP), AX
|
|
MOVQ length+24(FP), CX
|
|
|
|
// emitRepeat
|
|
// This BX value is overwritten on every path before it is stored.
LEAL -1(CX), BX
|
|
CMPL CX, $0x1d
|
|
JBE repeat_one_standalone
|
|
LEAL -30(CX), BX
|
|
CMPL CX, $0x0000011e
|
|
JB repeat_two_standalone
|
|
CMPL CX, $0x0001001e
|
|
JB repeat_three_standalone
|
|
MOVB $0xfc, (AX)
|
|
// MOVL stores 4 bytes at dst[1..4] although only 4 header bytes in total
// are counted, so dst[4] is written with the top byte of BX as scratch.
MOVL BX, 1(AX)
|
|
ADDQ $0x04, DX
|
|
ADDQ $0x04, AX
|
|
JMP gen_emit_repeat_end
|
|
|
|
repeat_three_standalone:
|
|
MOVB $0xf4, (AX)
|
|
MOVW BX, 1(AX)
|
|
ADDQ $0x03, DX
|
|
ADDQ $0x03, AX
|
|
JMP gen_emit_repeat_end
|
|
|
|
repeat_two_standalone:
|
|
MOVB $0xec, (AX)
|
|
MOVB BL, 1(AX)
|
|
ADDQ $0x02, DX
|
|
ADDQ $0x02, AX
|
|
JMP gen_emit_repeat_end
|
|
|
|
repeat_one_standalone:
|
|
// BX = length*8 - 4, computed with LEAL on a zeroed base register.
XORL BX, BX
|
|
LEAL -4(BX)(CX*8), BX
|
|
MOVB BL, (AX)
|
|
ADDQ $0x01, DX
|
|
ADDQ $0x01, AX
|
|
|
|
gen_emit_repeat_end:
|
|
MOVQ DX, ret+32(FP)
|
|
RET
|
|
|
|
// func emitCopy(dst []byte, offset int, length int) int
|
|
// Writes a copy operation for (offset, length) at the start of dst and
// returns the number of bytes written. No bounds checks against len(dst).
//
// Register roles: AX = write cursor, CX = offset (later scratch),
// DX = length (later scratch), BX = bytes written.
// Three encodings are selected by offset:
//   offset > 0x1003f        : "emitCopy3"  (4..7 bytes)
//   0x400 < offset <= 0x1003f: "emitCopy2" (3..6 bytes)
//   offset <= 0x400         : short form, 2 or 3 bytes, optionally followed
//                             by an inlined emitRepeat for long matches
TEXT ·emitCopy(SB), NOSPLIT, $0-48
|
|
XORQ BX, BX
|
|
MOVQ dst_base+0(FP), AX
|
|
MOVQ offset+24(FP), CX
|
|
MOVQ length+32(FP), DX
|
|
|
|
// emitCopy
|
|
CMPL CX, $0x0001003f
|
|
JBE two_byte_offset_standalone
|
|
|
|
// emitCopy3
|
|
// DX = length-4; CX = ((offset-65536) << 11) + 7. Bits 5..10 of CX are then
// filled with either DX itself (<= 60) or the marker 61/62/63 that says how
// many extra length bytes follow.
LEAL -4(DX), DX
|
|
LEAL -65536(CX), CX
|
|
SHLL $0x0b, CX
|
|
ADDL $0x07, CX
|
|
CMPL DX, $0x3c
|
|
JBE emit_copy3_0_standalone_emit3
|
|
LEAL -60(DX), SI
|
|
CMPL DX, $0x0000013c
|
|
JB emit_copy3_1_standalone_emit3
|
|
CMPL DX, $0x0001003c
|
|
JB emit_copy3_2_standalone_emit3
|
|
// 0x7e0 = 63<<5: three extra length bytes follow the 4-byte word.
ADDL $0x000007e0, CX
|
|
MOVL CX, (AX)
|
|
// MOVL writes 4 bytes at dst[4..7] but only 7 bytes are counted in total;
// dst[7] receives the top byte of SI as scratch.
MOVL SI, 4(AX)
|
|
ADDQ $0x07, BX
|
|
ADDQ $0x07, AX
|
|
JMP gen_emit_copy_end
|
|
|
|
emit_copy3_2_standalone_emit3:
|
|
// 0x7c0 = 62<<5: two extra length bytes.
ADDL $0x000007c0, CX
|
|
MOVL CX, (AX)
|
|
MOVW SI, 4(AX)
|
|
ADDQ $0x06, BX
|
|
ADDQ $0x06, AX
|
|
JMP gen_emit_copy_end
|
|
|
|
emit_copy3_1_standalone_emit3:
|
|
// 0x7a0 = 61<<5: one extra length byte.
ADDL $0x000007a0, CX
|
|
MOVL CX, (AX)
|
|
MOVB SI, 4(AX)
|
|
ADDQ $0x05, BX
|
|
ADDQ $0x05, AX
|
|
JMP gen_emit_copy_end
|
|
|
|
emit_copy3_0_standalone_emit3:
|
|
// length-4 fits directly in bits 5..10.
SHLL $0x05, DX
|
|
ORL DX, CX
|
|
MOVL CX, (AX)
|
|
ADDQ $0x04, BX
|
|
ADDQ $0x04, AX
|
|
JMP gen_emit_copy_end
|
|
|
|
two_byte_offset_standalone:
|
|
CMPL CX, $0x00000400
|
|
JA two_byte_standalone
|
|
CMPL DX, $0x00000013
|
|
JAE emit_one_longer_standalone
|
|
// Short form, length < 19: 16-bit word = ((offset-1) << 6) + length*4 - 15.
LEAL -1(CX), CX
|
|
SHLL $0x06, CX
|
|
LEAL -15(CX)(DX*4), CX
|
|
MOVW CX, (AX)
|
|
ADDQ $0x02, BX
|
|
ADDQ $0x02, AX
|
|
JMP gen_emit_copy_end
|
|
|
|
emit_one_longer_standalone:
|
|
CMPL DX, $0x00000112
|
|
JAE emit_copy1_repeat_standalone
|
|
// 19 <= length < 274: word = ((offset-1) << 6) + 61, then one byte length-18.
LEAL -1(CX), CX
|
|
SHLL $0x06, CX
|
|
LEAL 61(CX), CX
|
|
MOVW CX, (AX)
|
|
LEAL -18(DX), CX
|
|
MOVB CL, 2(AX)
|
|
ADDQ $0x03, BX
|
|
ADDQ $0x03, AX
|
|
JMP gen_emit_copy_end
|
|
|
|
emit_copy1_repeat_standalone:
|
|
// length >= 274: emit word ((offset-1) << 6) + 57, subtract 18 from the
// length and encode the remainder with the inlined emitRepeat below.
LEAL -1(CX), CX
|
|
SHLL $0x06, CX
|
|
LEAL 57(CX), CX
|
|
MOVW CX, (AX)
|
|
ADDQ $0x02, BX
|
|
ADDQ $0x02, AX
|
|
SUBL $0x12, DX
|
|
|
|
// emitRepeat
|
|
// Same encoding as the standalone emitRepeat, with DX as the length and CX
// as the stored value.
LEAL -1(DX), CX
|
|
CMPL DX, $0x1d
|
|
JBE repeat_one_emit_copy1_do_repeat_standalone
|
|
LEAL -30(DX), CX
|
|
CMPL DX, $0x0000011e
|
|
JB repeat_two_emit_copy1_do_repeat_standalone
|
|
CMPL DX, $0x0001001e
|
|
JB repeat_three_emit_copy1_do_repeat_standalone
|
|
MOVB $0xfc, (AX)
|
|
// 4-byte store at offset 1; only 4 bytes are counted, one scratch byte follows.
MOVL CX, 1(AX)
|
|
ADDQ $0x04, BX
|
|
ADDQ $0x04, AX
|
|
JMP gen_emit_copy_end
|
|
|
|
repeat_three_emit_copy1_do_repeat_standalone:
|
|
MOVB $0xf4, (AX)
|
|
MOVW CX, 1(AX)
|
|
ADDQ $0x03, BX
|
|
ADDQ $0x03, AX
|
|
JMP gen_emit_copy_end
|
|
|
|
repeat_two_emit_copy1_do_repeat_standalone:
|
|
MOVB $0xec, (AX)
|
|
MOVB CL, 1(AX)
|
|
ADDQ $0x02, BX
|
|
ADDQ $0x02, AX
|
|
JMP gen_emit_copy_end
|
|
|
|
repeat_one_emit_copy1_do_repeat_standalone:
|
|
XORL CX, CX
|
|
LEAL -4(CX)(DX*8), CX
|
|
MOVB CL, (AX)
|
|
ADDQ $0x01, BX
|
|
ADDQ $0x01, AX
|
|
JMP gen_emit_copy_end
|
|
|
|
two_byte_standalone:
|
|
// emitCopy2
|
|
// Bytes 1..2 hold offset-64; byte 0 is the tag, derived from DX = length-4.
LEAL -64(CX), CX
|
|
LEAL -4(DX), DX
|
|
MOVW CX, 1(AX)
|
|
CMPL DX, $0x3c
|
|
JBE emit_copy2_0_standalone_emit2
|
|
LEAL -60(DX), CX
|
|
CMPL DX, $0x0000013c
|
|
JB emit_copy2_1_standalone_emit2
|
|
CMPL DX, $0x0001003c
|
|
JB emit_copy2_2_standalone_emit2
|
|
// Tag 0xfe (63*4+2): three extra length bytes at offset 3.
MOVB $0xfe, (AX)
|
|
// 4-byte store at dst[3..6]; only 6 bytes are counted, dst[6] is scratch.
MOVL CX, 3(AX)
|
|
ADDQ $0x06, BX
|
|
ADDQ $0x06, AX
|
|
JMP gen_emit_copy_end
|
|
|
|
emit_copy2_2_standalone_emit2:
|
|
// Tag 0xfa (62*4+2): two extra length bytes.
MOVB $0xfa, (AX)
|
|
MOVW CX, 3(AX)
|
|
ADDQ $0x05, BX
|
|
ADDQ $0x05, AX
|
|
JMP gen_emit_copy_end
|
|
|
|
emit_copy2_1_standalone_emit2:
|
|
// Tag 0xf6 (61*4+2): one extra length byte.
MOVB $0xf6, (AX)
|
|
MOVB CL, 3(AX)
|
|
ADDQ $0x04, BX
|
|
ADDQ $0x04, AX
|
|
JMP gen_emit_copy_end
|
|
|
|
emit_copy2_0_standalone_emit2:
|
|
// Tag = 2 + (length-4)*4; no extra length bytes.
MOVL $0x00000002, CX
|
|
LEAL (CX)(DX*4), CX
|
|
MOVB CL, (AX)
|
|
ADDQ $0x03, BX
|
|
ADDQ $0x03, AX
|
|
|
|
gen_emit_copy_end:
|
|
MOVQ BX, ret+40(FP)
|
|
RET
|
|
|
|
// func emitCopyLits2(dst []byte, lits []byte, offset int, length int) int
|
|
// Requires: CMOV
|
|
// Writes a 3-byte copy header that also carries len(lits), then the literal
// bytes, then (for longer matches) a repeat header for the remaining length.
// Returns the total number of bytes counted as written.
//
// Register roles: AX = write cursor, BX = bytes written, SI = len(lits),
// CX = offset (later scratch / lits pointer), DX = length (later scratch),
// DI = leftover length handed to the inlined emitRepeat (0 = none).
// NOTE(review): the literal copy below only distinguishes len(lits) <3, ==3
// and >3 (always moving 4 bytes in the last case); presumably callers pass
// 1..4 literal bytes — confirm against the Go caller.
TEXT ·emitCopyLits2(SB), NOSPLIT, $0-72
|
|
XORQ BX, BX
|
|
MOVQ dst_base+0(FP), AX
|
|
MOVQ lits_len+32(FP), SI
|
|
MOVQ offset+48(FP), CX
|
|
MOVQ length+56(FP), DX
|
|
// The flags from this compare are overwritten by the XORQ below before any
// instruction consumes them, so it has no effect.
CMPL DX, $0x0b
|
|
|
|
// emitCopy2WithLits
|
|
XORQ DI, DI
|
|
// Bytes 1..2 of the header hold offset-64.
SUBL $0x40, CX
|
|
LEAL -11(DX), R8
|
|
LEAL -4(DX), DX
|
|
MOVW CX, 1(AX)
|
|
// If length-4 >= 7 the header stores 7 and DI = length-11 is left for the
// repeat; otherwise the header stores length-4 and DI stays 0.
CMPL DX, $0x07
|
|
CMOVLGE R8, DI
|
|
MOVQ $0x00000007, CX
|
|
CMOVLLT DX, CX
|
|
// Tag byte = 3 + 8*(len(lits)-1) + 32*min(length-4, 7).
LEAL -1(SI)(CX*4), CX
|
|
MOVL $0x00000003, DX
|
|
LEAL (DX)(CX*8), CX
|
|
MOVB CL, (AX)
|
|
ADDQ $0x03, BX
|
|
ADDQ $0x03, AX
|
|
MOVQ lits_base+24(FP), CX
|
|
|
|
// genMemMoveVeryShort
|
|
CMPQ SI, $0x03
|
|
JE standalone_emitcopy2_lits_move_3
|
|
JA standalone_emitcopy2_lits_move_4
|
|
// len(lits) < 3: copy first and last byte (identical for a single byte).
MOVB (CX), DL
|
|
MOVB -1(CX)(SI*1), CL
|
|
MOVB DL, (AX)
|
|
MOVB CL, -1(AX)(SI*1)
|
|
JMP standalone_emitcopy2_lits_end
|
|
|
|
standalone_emitcopy2_lits_move_3:
|
|
MOVW (CX), DX
|
|
MOVB 2(CX), CL
|
|
MOVW DX, (AX)
|
|
MOVB CL, 2(AX)
|
|
JMP standalone_emitcopy2_lits_end
|
|
|
|
standalone_emitcopy2_lits_move_4:
|
|
// len(lits) > 3: a single 4-byte move.
MOVL (CX), DX
|
|
MOVL DX, (AX)
|
|
|
|
standalone_emitcopy2_lits_end:
|
|
ADDQ SI, BX
|
|
ADDQ SI, AX
|
|
TESTL DI, DI
|
|
JZ standalone_emitcopy2_lits_done
|
|
|
|
// emitRepeat
|
|
// Same encoding as the standalone emitRepeat, with DI as the length and CX
// as the stored value.
LEAL -1(DI), CX
|
|
CMPL DI, $0x1d
|
|
JBE repeat_one_standalone_emitcopy2_lits
|
|
LEAL -30(DI), CX
|
|
CMPL DI, $0x0000011e
|
|
JB repeat_two_standalone_emitcopy2_lits
|
|
CMPL DI, $0x0001001e
|
|
JB repeat_three_standalone_emitcopy2_lits
|
|
MOVB $0xfc, (AX)
|
|
// 4-byte store at offset 1; only 4 bytes are counted, one scratch byte follows.
MOVL CX, 1(AX)
|
|
ADDQ $0x04, BX
|
|
ADDQ $0x04, AX
|
|
JMP standalone_emitcopy2_lits_done
|
|
|
|
repeat_three_standalone_emitcopy2_lits:
|
|
MOVB $0xf4, (AX)
|
|
MOVW CX, 1(AX)
|
|
ADDQ $0x03, BX
|
|
ADDQ $0x03, AX
|
|
JMP standalone_emitcopy2_lits_done
|
|
|
|
repeat_two_standalone_emitcopy2_lits:
|
|
MOVB $0xec, (AX)
|
|
MOVB CL, 1(AX)
|
|
ADDQ $0x02, BX
|
|
ADDQ $0x02, AX
|
|
JMP standalone_emitcopy2_lits_done
|
|
|
|
repeat_one_standalone_emitcopy2_lits:
|
|
XORL CX, CX
|
|
LEAL -4(CX)(DI*8), CX
|
|
MOVB CL, (AX)
|
|
ADDQ $0x01, BX
|
|
ADDQ $0x01, AX
|
|
|
|
standalone_emitcopy2_lits_done:
|
|
MOVQ BX, ret+64(FP)
|
|
RET
|
|
|
|
// func emitCopyLits3(dst []byte, lits []byte, offset int, length int) int
|
|
// Writes an "emitCopy3"-style header (4..7 bytes) whose first word also
// carries len(lits) in bits 3 and up, followed by the literal bytes.
// Returns header size + len(lits).
//
// Register roles: AX = write cursor, BX = bytes written, SI = len(lits),
// CX = offset (later header word / lits pointer), DX = length-4,
// DI = length-64 for the extended-length forms.
// NOTE(review): the literal copy handles len(lits)==3 and otherwise copies
// only the first and last byte, so presumably callers pass 1..3 literal
// bytes — confirm against the Go caller.
TEXT ·emitCopyLits3(SB), NOSPLIT, $0-72
|
|
XORQ BX, BX
|
|
MOVQ dst_base+0(FP), AX
|
|
MOVQ lits_len+32(FP), SI
|
|
MOVQ offset+48(FP), CX
|
|
MOVQ length+56(FP), DX
|
|
|
|
// emitCopy3
|
|
LEAL -4(DX), DX
|
|
LEAL -65536(CX), CX
|
|
SHLL $0x0b, CX
|
|
// CX = ((offset-65536) << 11) + 7 + len(lits)*8.
LEAL 7(CX)(SI*8), CX
|
|
CMPL DX, $0x3c
|
|
JBE emit_copy3_0_standalone_lits
|
|
LEAL -60(DX), DI
|
|
CMPL DX, $0x0000013c
|
|
JB emit_copy3_1_standalone_lits
|
|
CMPL DX, $0x0001003c
|
|
JB emit_copy3_2_standalone_lits
|
|
// 0x7e0 = 63<<5: three extra length bytes follow the 4-byte word.
ADDL $0x000007e0, CX
|
|
MOVL CX, (AX)
|
|
// 4-byte store at dst[4..7]; only 7 bytes are counted. The literal copy
// that follows starts at dst[7] and overwrites that scratch byte.
MOVL DI, 4(AX)
|
|
ADDQ $0x07, BX
|
|
ADDQ $0x07, AX
|
|
JMP gen_emit_copy_lits_copylits
|
|
|
|
emit_copy3_2_standalone_lits:
|
|
// 0x7c0 = 62<<5: two extra length bytes.
ADDL $0x000007c0, CX
|
|
MOVL CX, (AX)
|
|
MOVW DI, 4(AX)
|
|
ADDQ $0x06, BX
|
|
ADDQ $0x06, AX
|
|
JMP gen_emit_copy_lits_copylits
|
|
|
|
emit_copy3_1_standalone_lits:
|
|
// 0x7a0 = 61<<5: one extra length byte.
ADDL $0x000007a0, CX
|
|
MOVL CX, (AX)
|
|
MOVB DI, 4(AX)
|
|
ADDQ $0x05, BX
|
|
ADDQ $0x05, AX
|
|
JMP gen_emit_copy_lits_copylits
|
|
|
|
emit_copy3_0_standalone_lits:
|
|
// length-4 fits directly in bits 5..10 of the header word.
SHLL $0x05, DX
|
|
ORL DX, CX
|
|
MOVL CX, (AX)
|
|
ADDQ $0x04, BX
|
|
ADDQ $0x04, AX
|
|
|
|
gen_emit_copy_lits_copylits:
|
|
MOVQ lits_base+24(FP), CX
|
|
|
|
// genMemMoveVeryShort
|
|
CMPQ SI, $0x03
|
|
JE standalone_emitcopy3_lits_move_3
|
|
// Not 3 bytes: copy first and last byte (identical for a single byte).
MOVB (CX), DL
|
|
MOVB -1(CX)(SI*1), CL
|
|
MOVB DL, (AX)
|
|
MOVB CL, -1(AX)(SI*1)
|
|
JMP standalone_emitcopy3_lits_end
|
|
|
|
standalone_emitcopy3_lits_move_3:
|
|
MOVW (CX), DX
|
|
MOVB 2(CX), CL
|
|
MOVW DX, (AX)
|
|
MOVB CL, 2(AX)
|
|
|
|
standalone_emitcopy3_lits_end:
|
|
// AX is not advanced here; only the byte count is needed for the return.
ADDQ SI, BX
|
|
MOVQ BX, ret+64(FP)
|
|
RET
|
|
|
|
// func matchLen(a []byte, b []byte) int
|
|
// Requires: BMI
|
|
// Returns the number of leading bytes for which a and b are equal, examining
// at most len(a) bytes. Only a_len is read; b's length is never consulted,
// so b must be readable for at least len(a) bytes (caller's responsibility).
//
// Register roles: AX = a base, CX = b base, DX = bytes remaining (compared
// as a 32-bit value), SI = number of matching bytes so far (return value).
TEXT ·matchLen(SB), NOSPLIT, $0-56
|
|
MOVQ a_base+0(FP), AX
|
|
MOVQ b_base+24(FP), CX
|
|
MOVQ a_len+8(FP), DX
|
|
|
|
// matchLen
|
|
XORL SI, SI
|
|
JMP matchlen_loop_16_entry_standalone
|
|
|
|
matchlen_loopback_16_standalone:
|
|
// Compare 16 bytes per iteration as two 8-byte words; XOR leaves a non-zero
// result exactly when the words differ.
MOVQ (AX)(SI*1), BX
|
|
MOVQ 8(AX)(SI*1), DI
|
|
XORQ (CX)(SI*1), BX
|
|
JNZ matchlen_bsf_8_standalone
|
|
XORQ 8(CX)(SI*1), DI
|
|
JNZ matchlen_bsf_16standalone
|
|
LEAL -16(DX), DX
|
|
LEAL 16(SI), SI
|
|
|
|
matchlen_loop_16_entry_standalone:
|
|
CMPL DX, $0x10
|
|
JAE matchlen_loopback_16_standalone
|
|
JMP matchlen_match8_standalone
|
|
|
|
matchlen_bsf_16standalone:
|
|
// Mismatch in the second 8-byte word: index of the lowest set bit, divided
// by 8, is the offset of the first differing byte (little-endian load).
#ifdef GOAMD64_v3
|
|
TZCNTQ DI, DI
|
|
|
|
#else
|
|
BSFQ DI, DI
|
|
|
|
#endif
|
|
SARQ $0x03, DI
|
|
LEAL 8(SI)(DI*1), SI
|
|
JMP gen_match_len_end
|
|
|
|
matchlen_match8_standalone:
|
|
// Fewer than 16 bytes remain: try one 8-byte comparison.
CMPL DX, $0x08
|
|
JB matchlen_match4_standalone
|
|
MOVQ (AX)(SI*1), BX
|
|
XORQ (CX)(SI*1), BX
|
|
JNZ matchlen_bsf_8_standalone
|
|
LEAL -8(DX), DX
|
|
LEAL 8(SI), SI
|
|
JMP matchlen_match4_standalone
|
|
|
|
matchlen_bsf_8_standalone:
|
|
// Mismatch in the word at offset SI: add the differing byte's index to SI.
#ifdef GOAMD64_v3
|
|
TZCNTQ BX, BX
|
|
|
|
#else
|
|
BSFQ BX, BX
|
|
|
|
#endif
|
|
SARQ $0x03, BX
|
|
LEAL (SI)(BX*1), SI
|
|
JMP gen_match_len_end
|
|
|
|
matchlen_match4_standalone:
|
|
// 4-byte step. On mismatch fall to the 2- and 1-byte checks without
// advancing, so they locate the exact differing byte.
CMPL DX, $0x04
|
|
JB matchlen_match2_standalone
|
|
MOVL (AX)(SI*1), BX
|
|
CMPL (CX)(SI*1), BX
|
|
JNE matchlen_match2_standalone
|
|
LEAL -4(DX), DX
|
|
LEAL 4(SI), SI
|
|
|
|
matchlen_match2_standalone:
|
|
// One compare feeds both branches: DX == 1 -> single byte check,
// DX == 0 -> done.
CMPL DX, $0x01
|
|
JE matchlen_match1_standalone
|
|
JB gen_match_len_end
|
|
MOVW (AX)(SI*1), BX
|
|
CMPW (CX)(SI*1), BX
|
|
JNE matchlen_match1_standalone
|
|
LEAL 2(SI), SI
|
|
SUBL $0x02, DX
|
|
JZ gen_match_len_end
|
|
|
|
matchlen_match1_standalone:
|
|
MOVB (AX)(SI*1), BL
|
|
CMPB (CX)(SI*1), BL
|
|
JNE gen_match_len_end
|
|
LEAL 1(SI), SI
|
|
|
|
gen_match_len_end:
|
|
MOVQ SI, ret+48(FP)
|
|
RET
|
|
|
|
// func cvtLZ4BlockAsm(dst []byte, src []byte) (uncompressed int, dstUsed int)
|
|
// Requires: CMOV, SSE2
|
|
TEXT ·cvtLZ4BlockAsm(SB), NOSPLIT, $8-64
|
|
XORQ SI, SI
|
|
MOVQ dst_base+0(FP), AX
|
|
MOVQ dst_len+8(FP), CX
|
|
MOVQ src_base+24(FP), DX
|
|
MOVQ src_len+32(FP), BX
|
|
LEAQ (DX)(BX*1), BX
|
|
LEAQ -12(AX)(CX*1), CX
|
|
MOVL $0x00000001, (SP)
|
|
|
|
lz4_mz_loop:
|
|
CMPQ DX, BX
|
|
JAE lz4_mz_corrupt
|
|
CMPQ AX, CX
|
|
JAE lz4_mz_dstfull
|
|
MOVBQZX (DX), DI
|
|
MOVQ DI, R8
|
|
MOVQ DI, R9
|
|
ANDQ $0x0f, R9
|
|
XORQ R10, R10
|
|
SHRQ $0x04, R8
|
|
CMPQ DI, $0x50
|
|
CMOVQLT R8, R10
|
|
JLT lz4_mz_ll_end
|
|
CMPQ DI, $0xf0
|
|
JB lz4_mz_ll_end
|
|
|
|
lz4_mz_ll_loop:
|
|
INCQ DX
|
|
CMPQ DX, BX
|
|
JAE lz4_mz_corrupt
|
|
MOVBQZX (DX), DI
|
|
ADDQ DI, R8
|
|
CMPQ DI, $0xff
|
|
JEQ lz4_mz_ll_loop
|
|
|
|
lz4_mz_ll_end:
|
|
LEAQ (DX)(R8*1), DI
|
|
ADDQ $0x04, R9
|
|
CMPQ DI, BX
|
|
JAE lz4_mz_corrupt
|
|
INCQ DX
|
|
INCQ DI
|
|
TESTQ R8, R8
|
|
JZ lz4_mz_lits_done
|
|
TESTQ R10, R10
|
|
JNZ lz4_mz_lits_done
|
|
LEAQ (AX)(R8*1), R11
|
|
CMPQ R11, CX
|
|
JAE lz4_mz_dstfull
|
|
|
|
// emitLiteral
|
|
LEAL -1(R8), R11
|
|
CMPL R11, $0x1d
|
|
JB one_byte_lz4_mz
|
|
SUBL $0x1d, R11
|
|
CMPL R11, $0x00000100
|
|
JB two_bytes_lz4_mz
|
|
CMPL R11, $0x00010000
|
|
JB three_bytes_lz4_mz
|
|
MOVL R11, R12
|
|
SHRL $0x10, R12
|
|
MOVB $0xf8, (AX)
|
|
MOVW R11, 1(AX)
|
|
MOVB R12, 3(AX)
|
|
ADDQ $0x04, AX
|
|
ADDL $0x1d, R11
|
|
JMP memmove_long_lz4_mz
|
|
|
|
three_bytes_lz4_mz:
|
|
MOVB $0xf0, (AX)
|
|
MOVW R11, 1(AX)
|
|
ADDQ $0x03, AX
|
|
ADDL $0x1d, R11
|
|
JMP memmove_long_lz4_mz
|
|
|
|
two_bytes_lz4_mz:
|
|
MOVB $0xe8, (AX)
|
|
MOVB R11, 1(AX)
|
|
ADDL $0x1d, R11
|
|
ADDQ $0x02, AX
|
|
CMPL R11, $0x40
|
|
JB memmove_midlz4_mz
|
|
JMP memmove_long_lz4_mz
|
|
|
|
one_byte_lz4_mz:
|
|
SHLB $0x03, R11
|
|
MOVB R11, (AX)
|
|
ADDQ $0x01, AX
|
|
LEAQ (AX)(R8*1), R11
|
|
MOVL R8, R12
|
|
|
|
// genMemMoveShort
|
|
// margin: 0, min move: 1
|
|
CMPQ R12, $0x03
|
|
JB emit_lit_memmove_lz4_mz_memmove_move_1or2
|
|
JE emit_lit_memmove_lz4_mz_memmove_move_3
|
|
CMPQ R12, $0x08
|
|
JBE emit_lit_memmove_lz4_mz_memmove_move_4through8
|
|
CMPQ R12, $0x10
|
|
JBE emit_lit_memmove_lz4_mz_memmove_move_8through16
|
|
CMPQ R12, $0x20
|
|
JBE emit_lit_memmove_lz4_mz_memmove_move_17through32
|
|
JMP emit_lit_memmove_lz4_mz_memmove_move_33through64
|
|
|
|
emit_lit_memmove_lz4_mz_memmove_move_1or2:
|
|
MOVB (DX), R13
|
|
MOVB -1(DX)(R12*1), R14
|
|
MOVB R13, (AX)
|
|
MOVB R14, -1(AX)(R12*1)
|
|
JMP memmove_end_copy_lz4_mz
|
|
|
|
emit_lit_memmove_lz4_mz_memmove_move_3:
|
|
MOVW (DX), R13
|
|
MOVB 2(DX), R14
|
|
MOVW R13, (AX)
|
|
MOVB R14, 2(AX)
|
|
JMP memmove_end_copy_lz4_mz
|
|
|
|
emit_lit_memmove_lz4_mz_memmove_move_4through8:
|
|
MOVL (DX), R13
|
|
MOVL -4(DX)(R12*1), R14
|
|
MOVL R13, (AX)
|
|
MOVL R14, -4(AX)(R12*1)
|
|
JMP memmove_end_copy_lz4_mz
|
|
|
|
emit_lit_memmove_lz4_mz_memmove_move_8through16:
|
|
MOVQ (DX), R13
|
|
MOVQ -8(DX)(R12*1), R14
|
|
MOVQ R13, (AX)
|
|
MOVQ R14, -8(AX)(R12*1)
|
|
JMP memmove_end_copy_lz4_mz
|
|
|
|
emit_lit_memmove_lz4_mz_memmove_move_17through32:
|
|
MOVOU (DX), X0
|
|
MOVOU -16(DX)(R12*1), X1
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, -16(AX)(R12*1)
|
|
JMP memmove_end_copy_lz4_mz
|
|
|
|
emit_lit_memmove_lz4_mz_memmove_move_33through64:
|
|
MOVOU (DX), X0
|
|
MOVOU 16(DX), X1
|
|
MOVOU -32(DX)(R12*1), X2
|
|
MOVOU -16(DX)(R12*1), X3
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R12*1)
|
|
MOVOU X3, -16(AX)(R12*1)
|
|
|
|
memmove_end_copy_lz4_mz:
|
|
MOVQ R11, AX
|
|
JMP lz4_mz_lits_emit_done
|
|
|
|
memmove_midlz4_mz:
|
|
LEAQ (AX)(R8*1), R11
|
|
MOVL R8, R12
|
|
|
|
// genMemMoveShort
|
|
// margin: 0, min move: 30
|
|
CMPQ R12, $0x20
|
|
JBE emit_lit_memmove_mid_lz4_mz_memmove_move_17through32
|
|
JMP emit_lit_memmove_mid_lz4_mz_memmove_move_33through64
|
|
|
|
emit_lit_memmove_mid_lz4_mz_memmove_move_17through32:
|
|
MOVOU (DX), X0
|
|
MOVOU -16(DX)(R12*1), X1
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, -16(AX)(R12*1)
|
|
JMP memmove_mid_end_copy_lz4_mz
|
|
|
|
emit_lit_memmove_mid_lz4_mz_memmove_move_33through64:
|
|
MOVOU (DX), X0
|
|
MOVOU 16(DX), X1
|
|
MOVOU -32(DX)(R12*1), X2
|
|
MOVOU -16(DX)(R12*1), X3
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R12*1)
|
|
MOVOU X3, -16(AX)(R12*1)
|
|
|
|
memmove_mid_end_copy_lz4_mz:
|
|
MOVQ R11, AX
|
|
JMP lz4_mz_lits_emit_done
|
|
|
|
memmove_long_lz4_mz:
|
|
LEAQ (AX)(R8*1), R11
|
|
MOVL R8, R12
|
|
|
|
// genMemMoveLong
|
|
MOVOU (DX), X0
|
|
MOVOU 16(DX), X1
|
|
MOVOU -32(DX)(R12*1), X2
|
|
MOVOU -16(DX)(R12*1), X3
|
|
MOVQ R12, R14
|
|
SHRQ $0x05, R14
|
|
MOVQ AX, R13
|
|
ANDL $0x0000001f, R13
|
|
MOVQ $0x00000040, R15
|
|
SUBQ R13, R15
|
|
DECQ R14
|
|
JA emit_lit_memmove_long_lz4_mzlarge_forward_sse_loop_32
|
|
LEAQ -32(DX)(R15*1), R13
|
|
LEAQ -32(AX)(R15*1), BP
|
|
|
|
emit_lit_memmove_long_lz4_mzlarge_big_loop_back:
|
|
MOVOU (R13), X4
|
|
MOVOU 16(R13), X5
|
|
MOVOA X4, (BP)
|
|
MOVOA X5, 16(BP)
|
|
ADDQ $0x20, BP
|
|
ADDQ $0x20, R13
|
|
ADDQ $0x20, R15
|
|
DECQ R14
|
|
JNA emit_lit_memmove_long_lz4_mzlarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_lz4_mzlarge_forward_sse_loop_32:
|
|
MOVOU -32(DX)(R15*1), X4
|
|
MOVOU -16(DX)(R15*1), X5
|
|
MOVOA X4, -32(AX)(R15*1)
|
|
MOVOA X5, -16(AX)(R15*1)
|
|
ADDQ $0x20, R15
|
|
CMPQ R12, R15
|
|
JAE emit_lit_memmove_long_lz4_mzlarge_forward_sse_loop_32
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R12*1)
|
|
MOVOU X3, -16(AX)(R12*1)
|
|
MOVQ R11, AX
|
|
|
|
lz4_mz_lits_emit_done:
|
|
lz4_mz_lits_done:
|
|
ADDQ R8, SI
|
|
MOVQ DI, R8
|
|
MOVQ DX, DI
|
|
MOVQ R8, DX
|
|
CMPQ DX, BX
|
|
JNE lz4_mz_match
|
|
CMPQ R9, $0x04
|
|
JNE lz4_mz_corrupt
|
|
TESTQ R10, R10
|
|
JNZ lz4_mz_emit_final
|
|
JMP lz4_mz_done
|
|
|
|
lz4_mz_match:
|
|
ADDQ $0x02, DX
|
|
CMPQ DX, BX
|
|
JAE lz4_mz_corrupt
|
|
MOVWQZX -2(DX), R8
|
|
TESTQ R8, R8
|
|
JZ lz4_mz_corrupt
|
|
CMPQ R8, SI
|
|
JA lz4_mz_corrupt
|
|
CMPQ R9, $0x13
|
|
JNE lz4_mz_ml_done
|
|
|
|
lz4_mz_ml_loop:
|
|
MOVBQZX (DX), R11
|
|
INCQ DX
|
|
ADDQ R11, R9
|
|
CMPQ DX, BX
|
|
JAE lz4_mz_corrupt
|
|
CMPQ R11, $0xff
|
|
JEQ lz4_mz_ml_loop
|
|
|
|
lz4_mz_ml_done:
|
|
ADDQ R9, SI
|
|
TESTQ R10, R10
|
|
JNZ lz4_mz_dofuse
|
|
CMPQ (SP), R8
|
|
JNE lz4_mz_docopy
|
|
|
|
// emitRepeat
|
|
LEAL -1(R9), DI
|
|
CMPL R9, $0x1d
|
|
JBE repeat_one_lz4_mz
|
|
LEAL -30(R9), DI
|
|
CMPL R9, $0x0000011e
|
|
JB repeat_two_lz4_mz
|
|
CMPL R9, $0x0001001e
|
|
JB repeat_three_lz4_mz
|
|
MOVB $0xfc, (AX)
|
|
MOVL DI, 1(AX)
|
|
ADDQ $0x04, AX
|
|
JMP lz4_mz_loop
|
|
|
|
repeat_three_lz4_mz:
|
|
MOVB $0xf4, (AX)
|
|
MOVW DI, 1(AX)
|
|
ADDQ $0x03, AX
|
|
JMP lz4_mz_loop
|
|
|
|
repeat_two_lz4_mz:
|
|
MOVB $0xec, (AX)
|
|
MOVB DI, 1(AX)
|
|
ADDQ $0x02, AX
|
|
JMP lz4_mz_loop
|
|
|
|
repeat_one_lz4_mz:
|
|
XORL DI, DI
|
|
LEAL -4(DI)(R9*8), DI
|
|
MOVB DI, (AX)
|
|
ADDQ $0x01, AX
|
|
JMP lz4_mz_loop
|
|
|
|
lz4_mz_dofuse:
|
|
MOVQ R8, (SP)
|
|
CMPQ R8, $0x40
|
|
JB lz4_mz_doemitcopy
|
|
|
|
// emitCopy2WithLits
|
|
XORQ R11, R11
|
|
SUBL $0x40, R8
|
|
LEAL -11(R9), R12
|
|
LEAL -4(R9), R9
|
|
MOVW R8, 1(AX)
|
|
CMPL R9, $0x07
|
|
CMOVLGE R12, R11
|
|
MOVQ $0x00000007, R8
|
|
CMOVLLT R9, R8
|
|
LEAL -1(R10)(R8*4), R8
|
|
MOVL $0x00000003, R9
|
|
LEAL (R9)(R8*8), R8
|
|
MOVB R8, (AX)
|
|
ADDQ $0x03, AX
|
|
MOVL (DI), DI
|
|
MOVL DI, (AX)
|
|
ADDQ R10, AX
|
|
TESTL R11, R11
|
|
JZ lz4_mz_loop
|
|
|
|
// emitRepeat
|
|
LEAL -1(R11), DI
|
|
CMPL R11, $0x1d
|
|
JBE repeat_one_fused_emitrep_lz4_mz_
|
|
LEAL -30(R11), DI
|
|
CMPL R11, $0x0000011e
|
|
JB repeat_two_fused_emitrep_lz4_mz_
|
|
CMPL R11, $0x0001001e
|
|
JB repeat_three_fused_emitrep_lz4_mz_
|
|
MOVB $0xfc, (AX)
|
|
MOVL DI, 1(AX)
|
|
ADDQ $0x04, AX
|
|
JMP lz4_mz_loop
|
|
|
|
repeat_three_fused_emitrep_lz4_mz_:
|
|
MOVB $0xf4, (AX)
|
|
MOVW DI, 1(AX)
|
|
ADDQ $0x03, AX
|
|
JMP lz4_mz_loop
|
|
|
|
repeat_two_fused_emitrep_lz4_mz_:
|
|
MOVB $0xec, (AX)
|
|
MOVB DI, 1(AX)
|
|
ADDQ $0x02, AX
|
|
JMP lz4_mz_loop
|
|
|
|
repeat_one_fused_emitrep_lz4_mz_:
|
|
XORL DI, DI
|
|
LEAL -4(DI)(R11*8), DI
|
|
MOVB DI, (AX)
|
|
ADDQ $0x01, AX
|
|
JMP lz4_mz_loop
|
|
|
|
lz4_mz_doemitcopy:
|
|
// emitLiteral
|
|
LEAL -1(R10), R11
|
|
CMPL R11, $0x1d
|
|
JB one_byte_lz4_mz_emitcopy
|
|
SUBL $0x1d, R11
|
|
CMPL R11, $0x00000100
|
|
JB two_bytes_lz4_mz_emitcopy
|
|
CMPL R11, $0x00010000
|
|
JB three_bytes_lz4_mz_emitcopy
|
|
MOVL R11, R12
|
|
SHRL $0x10, R12
|
|
MOVB $0xf8, (AX)
|
|
MOVW R11, 1(AX)
|
|
MOVB R12, 3(AX)
|
|
ADDQ $0x04, AX
|
|
ADDL $0x1d, R11
|
|
JMP memmove_long_lz4_mz_emitcopy
|
|
|
|
three_bytes_lz4_mz_emitcopy:
|
|
MOVB $0xf0, (AX)
|
|
MOVW R11, 1(AX)
|
|
ADDQ $0x03, AX
|
|
ADDL $0x1d, R11
|
|
JMP memmove_long_lz4_mz_emitcopy
|
|
|
|
two_bytes_lz4_mz_emitcopy:
|
|
MOVB $0xe8, (AX)
|
|
MOVB R11, 1(AX)
|
|
ADDL $0x1d, R11
|
|
ADDQ $0x02, AX
|
|
CMPL R11, $0x40
|
|
JB memmove_midlz4_mz_emitcopy
|
|
JMP memmove_long_lz4_mz_emitcopy
|
|
|
|
one_byte_lz4_mz_emitcopy:
|
|
SHLB $0x03, R11
|
|
MOVB R11, (AX)
|
|
ADDQ $0x01, AX
|
|
LEAQ (AX)(R10*1), R11
|
|
|
|
// genMemMoveShort
|
|
// margin: 0, min move: 1
|
|
CMPQ R10, $0x03
|
|
JB emit_lit_memmove_lz4_mz_emitcopy_memmove_move_1or2
|
|
JE emit_lit_memmove_lz4_mz_emitcopy_memmove_move_3
|
|
CMPQ R10, $0x08
|
|
JBE emit_lit_memmove_lz4_mz_emitcopy_memmove_move_4through8
|
|
CMPQ R10, $0x10
|
|
JBE emit_lit_memmove_lz4_mz_emitcopy_memmove_move_8through16
|
|
CMPQ R10, $0x20
|
|
JBE emit_lit_memmove_lz4_mz_emitcopy_memmove_move_17through32
|
|
JMP emit_lit_memmove_lz4_mz_emitcopy_memmove_move_33through64
|
|
|
|
emit_lit_memmove_lz4_mz_emitcopy_memmove_move_1or2:
|
|
MOVB (DI), R12
|
|
MOVB -1(DI)(R10*1), DI
|
|
MOVB R12, (AX)
|
|
MOVB DI, -1(AX)(R10*1)
|
|
JMP memmove_end_copy_lz4_mz_emitcopy
|
|
|
|
emit_lit_memmove_lz4_mz_emitcopy_memmove_move_3:
|
|
MOVW (DI), R12
|
|
MOVB 2(DI), DI
|
|
MOVW R12, (AX)
|
|
MOVB DI, 2(AX)
|
|
JMP memmove_end_copy_lz4_mz_emitcopy
|
|
|
|
emit_lit_memmove_lz4_mz_emitcopy_memmove_move_4through8:
|
|
MOVL (DI), R12
|
|
MOVL -4(DI)(R10*1), DI
|
|
MOVL R12, (AX)
|
|
MOVL DI, -4(AX)(R10*1)
|
|
JMP memmove_end_copy_lz4_mz_emitcopy
|
|
|
|
emit_lit_memmove_lz4_mz_emitcopy_memmove_move_8through16:
|
|
MOVQ (DI), R12
|
|
MOVQ -8(DI)(R10*1), DI
|
|
MOVQ R12, (AX)
|
|
MOVQ DI, -8(AX)(R10*1)
|
|
JMP memmove_end_copy_lz4_mz_emitcopy
|
|
|
|
emit_lit_memmove_lz4_mz_emitcopy_memmove_move_17through32:
|
|
MOVOU (DI), X0
|
|
MOVOU -16(DI)(R10*1), X1
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, -16(AX)(R10*1)
|
|
JMP memmove_end_copy_lz4_mz_emitcopy
|
|
|
|
emit_lit_memmove_lz4_mz_emitcopy_memmove_move_33through64:
|
|
MOVOU (DI), X0
|
|
MOVOU 16(DI), X1
|
|
MOVOU -32(DI)(R10*1), X2
|
|
MOVOU -16(DI)(R10*1), X3
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R10*1)
|
|
MOVOU X3, -16(AX)(R10*1)
|
|
|
|
memmove_end_copy_lz4_mz_emitcopy:
|
|
MOVQ R11, AX
|
|
JMP lz4_mz__emit_done
|
|
|
|
memmove_midlz4_mz_emitcopy:
|
|
LEAQ (AX)(R10*1), R11
|
|
|
|
// genMemMoveShort
|
|
// margin: 0, min move: 30
|
|
CMPQ R10, $0x20
|
|
JBE emit_lit_memmove_mid_lz4_mz_emitcopy_memmove_move_17through32
|
|
JMP emit_lit_memmove_mid_lz4_mz_emitcopy_memmove_move_33through64
|
|
|
|
emit_lit_memmove_mid_lz4_mz_emitcopy_memmove_move_17through32:
|
|
MOVOU (DI), X0
|
|
MOVOU -16(DI)(R10*1), X1
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, -16(AX)(R10*1)
|
|
JMP memmove_mid_end_copy_lz4_mz_emitcopy
|
|
|
|
emit_lit_memmove_mid_lz4_mz_emitcopy_memmove_move_33through64:
|
|
MOVOU (DI), X0
|
|
MOVOU 16(DI), X1
|
|
MOVOU -32(DI)(R10*1), X2
|
|
MOVOU -16(DI)(R10*1), X3
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R10*1)
|
|
MOVOU X3, -16(AX)(R10*1)
|
|
|
|
memmove_mid_end_copy_lz4_mz_emitcopy:
|
|
MOVQ R11, AX
|
|
JMP lz4_mz__emit_done
|
|
|
|
memmove_long_lz4_mz_emitcopy:
|
|
LEAQ (AX)(R10*1), R11
|
|
|
|
// genMemMoveLong
|
|
MOVOU (DI), X0
|
|
MOVOU 16(DI), X1
|
|
MOVOU -32(DI)(R10*1), X2
|
|
MOVOU -16(DI)(R10*1), X3
|
|
MOVQ R10, R13
|
|
SHRQ $0x05, R13
|
|
MOVQ AX, R12
|
|
ANDL $0x0000001f, R12
|
|
MOVQ $0x00000040, R14
|
|
SUBQ R12, R14
|
|
DECQ R13
|
|
JA emit_lit_memmove_long_lz4_mz_emitcopylarge_forward_sse_loop_32
|
|
LEAQ -32(DI)(R14*1), R12
|
|
LEAQ -32(AX)(R14*1), R15
|
|
|
|
emit_lit_memmove_long_lz4_mz_emitcopylarge_big_loop_back:
|
|
MOVOU (R12), X4
|
|
MOVOU 16(R12), X5
|
|
MOVOA X4, (R15)
|
|
MOVOA X5, 16(R15)
|
|
ADDQ $0x20, R15
|
|
ADDQ $0x20, R12
|
|
ADDQ $0x20, R14
|
|
DECQ R13
|
|
JNA emit_lit_memmove_long_lz4_mz_emitcopylarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_lz4_mz_emitcopylarge_forward_sse_loop_32:
|
|
MOVOU -32(DI)(R14*1), X4
|
|
MOVOU -16(DI)(R14*1), X5
|
|
MOVOA X4, -32(AX)(R14*1)
|
|
MOVOA X5, -16(AX)(R14*1)
|
|
ADDQ $0x20, R14
|
|
CMPQ R10, R14
|
|
JAE emit_lit_memmove_long_lz4_mz_emitcopylarge_forward_sse_loop_32
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R10*1)
|
|
MOVOU X3, -16(AX)(R10*1)
|
|
MOVQ R11, AX
|
|
|
|
lz4_mz__emit_done:
|
|
// emitCopy
|
|
CMPL R8, $0x00000400
|
|
JA two_byte_lz4_mz__lz4_mz_short_
|
|
CMPL R9, $0x00000013
|
|
JAE emit_one_longer_lz4_mz__lz4_mz_short_
|
|
LEAL -1(R8), DI
|
|
SHLL $0x06, DI
|
|
LEAL -15(DI)(R9*4), DI
|
|
MOVW DI, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP lz4_mz_loop
|
|
|
|
emit_one_longer_lz4_mz__lz4_mz_short_:
|
|
CMPL R9, $0x00000112
|
|
JAE emit_copy1_repeat_lz4_mz__lz4_mz_short_
|
|
LEAL -1(R8), DI
|
|
SHLL $0x06, DI
|
|
LEAL 61(DI), DI
|
|
MOVW DI, (AX)
|
|
LEAL -18(R9), DI
|
|
MOVB DI, 2(AX)
|
|
ADDQ $0x03, AX
|
|
JMP lz4_mz_loop
|
|
|
|
emit_copy1_repeat_lz4_mz__lz4_mz_short_:
|
|
LEAL -1(R8), DI
|
|
SHLL $0x06, DI
|
|
LEAL 57(DI), DI
|
|
MOVW DI, (AX)
|
|
ADDQ $0x02, AX
|
|
SUBL $0x12, R9
|
|
|
|
// emitRepeat
|
|
LEAL -1(R9), DI
|
|
CMPL R9, $0x1d
|
|
JBE repeat_one_emit_copy1_do_repeat_lz4_mz__lz4_mz_short_
|
|
LEAL -30(R9), DI
|
|
CMPL R9, $0x0000011e
|
|
JB repeat_two_emit_copy1_do_repeat_lz4_mz__lz4_mz_short_
|
|
CMPL R9, $0x0001001e
|
|
JB repeat_three_emit_copy1_do_repeat_lz4_mz__lz4_mz_short_
|
|
MOVB $0xfc, (AX)
|
|
MOVL DI, 1(AX)
|
|
ADDQ $0x04, AX
|
|
JMP lz4_mz_loop
|
|
|
|
repeat_three_emit_copy1_do_repeat_lz4_mz__lz4_mz_short_:
|
|
MOVB $0xf4, (AX)
|
|
MOVW DI, 1(AX)
|
|
ADDQ $0x03, AX
|
|
JMP lz4_mz_loop
|
|
|
|
repeat_two_emit_copy1_do_repeat_lz4_mz__lz4_mz_short_:
|
|
MOVB $0xec, (AX)
|
|
MOVB DI, 1(AX)
|
|
ADDQ $0x02, AX
|
|
JMP lz4_mz_loop
|
|
|
|
repeat_one_emit_copy1_do_repeat_lz4_mz__lz4_mz_short_:
|
|
XORL DI, DI
|
|
LEAL -4(DI)(R9*8), DI
|
|
MOVB DI, (AX)
|
|
ADDQ $0x01, AX
|
|
JMP lz4_mz_loop
|
|
|
|
two_byte_lz4_mz__lz4_mz_short_:
|
|
// emitCopy2
|
|
LEAL -64(R8), R8
|
|
LEAL -4(R9), R9
|
|
MOVW R8, 1(AX)
|
|
CMPL R9, $0x3c
|
|
JBE emit_copy2_0_lz4_mz__lz4_mz_short__emit2
|
|
LEAL -60(R9), DI
|
|
CMPL R9, $0x0000013c
|
|
JB emit_copy2_1_lz4_mz__lz4_mz_short__emit2
|
|
CMPL R9, $0x0001003c
|
|
JB emit_copy2_2_lz4_mz__lz4_mz_short__emit2
|
|
MOVB $0xfe, (AX)
|
|
MOVL DI, 3(AX)
|
|
ADDQ $0x06, AX
|
|
JMP lz4_mz_loop
|
|
|
|
emit_copy2_2_lz4_mz__lz4_mz_short__emit2:
|
|
MOVB $0xfa, (AX)
|
|
MOVW DI, 3(AX)
|
|
ADDQ $0x05, AX
|
|
JMP lz4_mz_loop
|
|
|
|
emit_copy2_1_lz4_mz__lz4_mz_short__emit2:
|
|
MOVB $0xf6, (AX)
|
|
MOVB DI, 3(AX)
|
|
ADDQ $0x04, AX
|
|
JMP lz4_mz_loop
|
|
|
|
emit_copy2_0_lz4_mz__lz4_mz_short__emit2:
|
|
MOVL $0x00000002, DI
|
|
LEAL (DI)(R9*4), DI
|
|
MOVB DI, (AX)
|
|
ADDQ $0x03, AX
|
|
JMP lz4_mz_loop
|
|
|
|
lz4_mz_docopy:
|
|
MOVQ R8, (SP)
|
|
|
|
// emitCopy
|
|
CMPL R8, $0x00000400
|
|
JA two_byte_lz4_mz__lz4_mz
|
|
CMPL R9, $0x00000013
|
|
JAE emit_one_longer_lz4_mz__lz4_mz
|
|
LEAL -1(R8), DI
|
|
SHLL $0x06, DI
|
|
LEAL -15(DI)(R9*4), DI
|
|
MOVW DI, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP lz4_mz_loop
|
|
|
|
emit_one_longer_lz4_mz__lz4_mz:
|
|
CMPL R9, $0x00000112
|
|
JAE emit_copy1_repeat_lz4_mz__lz4_mz
|
|
LEAL -1(R8), DI
|
|
SHLL $0x06, DI
|
|
LEAL 61(DI), DI
|
|
MOVW DI, (AX)
|
|
LEAL -18(R9), DI
|
|
MOVB DI, 2(AX)
|
|
ADDQ $0x03, AX
|
|
JMP lz4_mz_loop
|
|
|
|
emit_copy1_repeat_lz4_mz__lz4_mz:
|
|
LEAL -1(R8), DI
|
|
SHLL $0x06, DI
|
|
LEAL 57(DI), DI
|
|
MOVW DI, (AX)
|
|
ADDQ $0x02, AX
|
|
SUBL $0x12, R9
|
|
|
|
// emitRepeat
|
|
LEAL -1(R9), DI
|
|
CMPL R9, $0x1d
|
|
JBE repeat_one_emit_copy1_do_repeat_lz4_mz__lz4_mz
|
|
LEAL -30(R9), DI
|
|
CMPL R9, $0x0000011e
|
|
JB repeat_two_emit_copy1_do_repeat_lz4_mz__lz4_mz
|
|
CMPL R9, $0x0001001e
|
|
JB repeat_three_emit_copy1_do_repeat_lz4_mz__lz4_mz
|
|
MOVB $0xfc, (AX)
|
|
MOVL DI, 1(AX)
|
|
ADDQ $0x04, AX
|
|
JMP lz4_mz_loop
|
|
|
|
repeat_three_emit_copy1_do_repeat_lz4_mz__lz4_mz:
|
|
MOVB $0xf4, (AX)
|
|
MOVW DI, 1(AX)
|
|
ADDQ $0x03, AX
|
|
JMP lz4_mz_loop
|
|
|
|
repeat_two_emit_copy1_do_repeat_lz4_mz__lz4_mz:
|
|
MOVB $0xec, (AX)
|
|
MOVB DI, 1(AX)
|
|
ADDQ $0x02, AX
|
|
JMP lz4_mz_loop
|
|
|
|
repeat_one_emit_copy1_do_repeat_lz4_mz__lz4_mz:
|
|
XORL DI, DI
|
|
LEAL -4(DI)(R9*8), DI
|
|
MOVB DI, (AX)
|
|
ADDQ $0x01, AX
|
|
JMP lz4_mz_loop
|
|
|
|
two_byte_lz4_mz__lz4_mz:
|
|
// emitCopy2
|
|
LEAL -64(R8), R8
|
|
LEAL -4(R9), R9
|
|
MOVW R8, 1(AX)
|
|
CMPL R9, $0x3c
|
|
JBE emit_copy2_0_lz4_mz__lz4_mz_emit2
|
|
LEAL -60(R9), DI
|
|
CMPL R9, $0x0000013c
|
|
JB emit_copy2_1_lz4_mz__lz4_mz_emit2
|
|
CMPL R9, $0x0001003c
|
|
JB emit_copy2_2_lz4_mz__lz4_mz_emit2
|
|
MOVB $0xfe, (AX)
|
|
MOVL DI, 3(AX)
|
|
ADDQ $0x06, AX
|
|
JMP lz4_mz_loop
|
|
|
|
emit_copy2_2_lz4_mz__lz4_mz_emit2:
|
|
MOVB $0xfa, (AX)
|
|
MOVW DI, 3(AX)
|
|
ADDQ $0x05, AX
|
|
JMP lz4_mz_loop
|
|
|
|
emit_copy2_1_lz4_mz__lz4_mz_emit2:
|
|
MOVB $0xf6, (AX)
|
|
MOVB DI, 3(AX)
|
|
ADDQ $0x04, AX
|
|
JMP lz4_mz_loop
|
|
|
|
emit_copy2_0_lz4_mz__lz4_mz_emit2:
|
|
MOVL $0x00000002, DI
|
|
LEAL (DI)(R9*4), DI
|
|
MOVB DI, (AX)
|
|
ADDQ $0x03, AX
|
|
JMP lz4_mz_loop
|
|
|
|
lz4_mz_emit_final:
|
|
// emitLiteral
|
|
LEAL -1(R10), CX
|
|
CMPL CX, $0x1d
|
|
JB one_byte_lz4_mz_emit_final
|
|
SUBL $0x1d, CX
|
|
CMPL CX, $0x00000100
|
|
JB two_bytes_lz4_mz_emit_final
|
|
CMPL CX, $0x00010000
|
|
JB three_bytes_lz4_mz_emit_final
|
|
MOVL CX, DX
|
|
SHRL $0x10, DX
|
|
MOVB $0xf8, (AX)
|
|
MOVW CX, 1(AX)
|
|
MOVB DL, 3(AX)
|
|
ADDQ $0x04, AX
|
|
ADDL $0x1d, CX
|
|
JMP memmove_long_lz4_mz_emit_final
|
|
|
|
three_bytes_lz4_mz_emit_final:
|
|
MOVB $0xf0, (AX)
|
|
MOVW CX, 1(AX)
|
|
ADDQ $0x03, AX
|
|
ADDL $0x1d, CX
|
|
JMP memmove_long_lz4_mz_emit_final
|
|
|
|
two_bytes_lz4_mz_emit_final:
|
|
MOVB $0xe8, (AX)
|
|
MOVB CL, 1(AX)
|
|
ADDL $0x1d, CX
|
|
ADDQ $0x02, AX
|
|
CMPL CX, $0x40
|
|
JB memmove_midlz4_mz_emit_final
|
|
JMP memmove_long_lz4_mz_emit_final
|
|
|
|
one_byte_lz4_mz_emit_final:
|
|
SHLB $0x03, CL
|
|
MOVB CL, (AX)
|
|
ADDQ $0x01, AX
|
|
LEAQ (AX)(R10*1), CX
|
|
MOVL R10, DX
|
|
|
|
// genMemMoveShort
|
|
// margin: 0, min move: 1
|
|
CMPQ DX, $0x03
|
|
JB emit_lit_memmove_lz4_mz_emit_final_memmove_move_1or2
|
|
JE emit_lit_memmove_lz4_mz_emit_final_memmove_move_3
|
|
CMPQ DX, $0x08
|
|
JBE emit_lit_memmove_lz4_mz_emit_final_memmove_move_4through8
|
|
CMPQ DX, $0x10
|
|
JBE emit_lit_memmove_lz4_mz_emit_final_memmove_move_8through16
|
|
CMPQ DX, $0x20
|
|
JBE emit_lit_memmove_lz4_mz_emit_final_memmove_move_17through32
|
|
JMP emit_lit_memmove_lz4_mz_emit_final_memmove_move_33through64
|
|
|
|
emit_lit_memmove_lz4_mz_emit_final_memmove_move_1or2:
|
|
MOVB (DI), BL
|
|
MOVB -1(DI)(DX*1), DI
|
|
MOVB BL, (AX)
|
|
MOVB DI, -1(AX)(DX*1)
|
|
JMP memmove_end_copy_lz4_mz_emit_final
|
|
|
|
emit_lit_memmove_lz4_mz_emit_final_memmove_move_3:
|
|
MOVW (DI), BX
|
|
MOVB 2(DI), DI
|
|
MOVW BX, (AX)
|
|
MOVB DI, 2(AX)
|
|
JMP memmove_end_copy_lz4_mz_emit_final
|
|
|
|
emit_lit_memmove_lz4_mz_emit_final_memmove_move_4through8:
|
|
MOVL (DI), BX
|
|
MOVL -4(DI)(DX*1), DI
|
|
MOVL BX, (AX)
|
|
MOVL DI, -4(AX)(DX*1)
|
|
JMP memmove_end_copy_lz4_mz_emit_final
|
|
|
|
emit_lit_memmove_lz4_mz_emit_final_memmove_move_8through16:
|
|
MOVQ (DI), BX
|
|
MOVQ -8(DI)(DX*1), DI
|
|
MOVQ BX, (AX)
|
|
MOVQ DI, -8(AX)(DX*1)
|
|
JMP memmove_end_copy_lz4_mz_emit_final
|
|
|
|
emit_lit_memmove_lz4_mz_emit_final_memmove_move_17through32:
|
|
MOVOU (DI), X0
|
|
MOVOU -16(DI)(DX*1), X1
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, -16(AX)(DX*1)
|
|
JMP memmove_end_copy_lz4_mz_emit_final
|
|
|
|
emit_lit_memmove_lz4_mz_emit_final_memmove_move_33through64:
|
|
MOVOU (DI), X0
|
|
MOVOU 16(DI), X1
|
|
MOVOU -32(DI)(DX*1), X2
|
|
MOVOU -16(DI)(DX*1), X3
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(DX*1)
|
|
MOVOU X3, -16(AX)(DX*1)
|
|
|
|
memmove_end_copy_lz4_mz_emit_final:
|
|
MOVQ CX, AX
|
|
JMP lz4_mz_done
|
|
|
|
memmove_midlz4_mz_emit_final:
|
|
LEAQ (AX)(R10*1), CX
|
|
MOVL R10, DX
|
|
|
|
// genMemMoveShort
|
|
// margin: 0, min move: 30
|
|
CMPQ DX, $0x20
|
|
JBE emit_lit_memmove_mid_lz4_mz_emit_final_memmove_move_17through32
|
|
JMP emit_lit_memmove_mid_lz4_mz_emit_final_memmove_move_33through64
|
|
|
|
emit_lit_memmove_mid_lz4_mz_emit_final_memmove_move_17through32:
|
|
MOVOU (DI), X0
|
|
MOVOU -16(DI)(DX*1), X1
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, -16(AX)(DX*1)
|
|
JMP memmove_mid_end_copy_lz4_mz_emit_final
|
|
|
|
emit_lit_memmove_mid_lz4_mz_emit_final_memmove_move_33through64:
|
|
MOVOU (DI), X0
|
|
MOVOU 16(DI), X1
|
|
MOVOU -32(DI)(DX*1), X2
|
|
MOVOU -16(DI)(DX*1), X3
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(DX*1)
|
|
MOVOU X3, -16(AX)(DX*1)
|
|
|
|
memmove_mid_end_copy_lz4_mz_emit_final:
|
|
MOVQ CX, AX
|
|
JMP lz4_mz_done
|
|
|
|
memmove_long_lz4_mz_emit_final:
|
|
LEAQ (AX)(R10*1), CX
|
|
MOVL R10, DX
|
|
|
|
// genMemMoveLong
|
|
MOVOU (DI), X0
|
|
MOVOU 16(DI), X1
|
|
MOVOU -32(DI)(DX*1), X2
|
|
MOVOU -16(DI)(DX*1), X3
|
|
MOVQ DX, R8
|
|
SHRQ $0x05, R8
|
|
MOVQ AX, BX
|
|
ANDL $0x0000001f, BX
|
|
MOVQ $0x00000040, R9
|
|
SUBQ BX, R9
|
|
DECQ R8
|
|
JA emit_lit_memmove_long_lz4_mz_emit_finallarge_forward_sse_loop_32
|
|
LEAQ -32(DI)(R9*1), BX
|
|
LEAQ -32(AX)(R9*1), R10
|
|
|
|
emit_lit_memmove_long_lz4_mz_emit_finallarge_big_loop_back:
|
|
MOVOU (BX), X4
|
|
MOVOU 16(BX), X5
|
|
MOVOA X4, (R10)
|
|
MOVOA X5, 16(R10)
|
|
ADDQ $0x20, R10
|
|
ADDQ $0x20, BX
|
|
ADDQ $0x20, R9
|
|
DECQ R8
|
|
JNA emit_lit_memmove_long_lz4_mz_emit_finallarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_lz4_mz_emit_finallarge_forward_sse_loop_32:
|
|
MOVOU -32(DI)(R9*1), X4
|
|
MOVOU -16(DI)(R9*1), X5
|
|
MOVOA X4, -32(AX)(R9*1)
|
|
MOVOA X5, -16(AX)(R9*1)
|
|
ADDQ $0x20, R9
|
|
CMPQ DX, R9
|
|
JAE emit_lit_memmove_long_lz4_mz_emit_finallarge_forward_sse_loop_32
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(DX*1)
|
|
MOVOU X3, -16(AX)(DX*1)
|
|
MOVQ CX, AX
|
|
|
|
lz4_mz_done:
|
|
MOVQ dst_base+0(FP), CX
|
|
SUBQ CX, AX
|
|
MOVQ SI, uncompressed+48(FP)
|
|
MOVQ AX, dstUsed+56(FP)
|
|
RET
|
|
|
|
lz4_mz_corrupt:
|
|
XORQ AX, AX
|
|
LEAQ -1(AX), SI
|
|
MOVQ SI, uncompressed+48(FP)
|
|
RET
|
|
|
|
lz4_mz_dstfull:
|
|
XORQ AX, AX
|
|
LEAQ -2(AX), SI
|
|
MOVQ SI, uncompressed+48(FP)
|
|
RET
|
|
|
|
// func decodeBlockAsm(dst []byte, src []byte) int
|
|
// Requires: CMOV, SSE2
|
|
TEXT ·decodeBlockAsm(SB), $8-56
|
|
MOVQ dst_base+0(FP), AX
|
|
MOVQ dst_len+8(FP), CX
|
|
MOVQ src_base+24(FP), DX
|
|
MOVQ src_len+32(FP), BX
|
|
MOVQ AX, SI
|
|
XORQ DI, DI
|
|
MOVQ DX, R8
|
|
MOVQ $0x00000001, R9
|
|
LEAQ (AX)(CX*1), AX
|
|
LEAQ (DX)(BX*1), CX
|
|
LEAQ -20(CX), DX
|
|
LEAQ -20(AX), BX
|
|
CMPQ R8, DX
|
|
JAE decodeBlockAsm_fast_end_copy
|
|
MOVBQZX (R8), R10
|
|
MOVQ R10, R11
|
|
SHRQ $0x02, R11
|
|
|
|
decodeBlockAsm_fast_loop_nofetch:
|
|
CMPQ SI, BX
|
|
JAE decodeBlockAsm_fast_end_copy
|
|
ANDQ $0x03, R10
|
|
JNZ decodeBlockAsm_fast_copy
|
|
|
|
decodeBlockAsm_fast_lits:
|
|
MOVL R11, R12
|
|
SHRL $0x01, R12
|
|
CMPL R12, $0x1d
|
|
JB decodeBlockAsm_fast_lit_0
|
|
JEQ decodeBlockAsm_fast_lit_1
|
|
CMPL R12, $0x1e
|
|
JEQ decodeBlockAsm_fast_lit_2
|
|
JMP decodeBlockAsm_fast_lit_3
|
|
|
|
decodeBlockAsm_fast_lit_0:
|
|
INCQ R8
|
|
INCL R12
|
|
LEAQ (SI)(R12*1), R10
|
|
CMPQ R10, AX
|
|
JA corrupt
|
|
BTL $0x00, R11
|
|
JC decodeBlockAsm_fast_copy_exec_short
|
|
LEAQ (R8)(R12*1), R10
|
|
CMPQ R10, CX
|
|
JA corrupt
|
|
|
|
// genMemMoveShort
|
|
// margin: 19, min move: 1
|
|
CMPQ R12, $0x10
|
|
JBE decodeBlockAsm_fast_lit_0_copy_memmove_move_8through16
|
|
CMPQ R12, $0x20
|
|
JBE decodeBlockAsm_fast_lit_0_copy_memmove_move_17through32
|
|
JMP decodeBlockAsm_fast_lit_0_copy_memmove_move_33through64
|
|
|
|
decodeBlockAsm_fast_lit_0_copy_memmove_move_8through16:
|
|
MOVOU (R8), X0
|
|
MOVOU X0, (SI)
|
|
JMP decodeBlockAsm_fast_litcopy_done
|
|
|
|
decodeBlockAsm_fast_lit_0_copy_memmove_move_17through32:
|
|
MOVOU (R8), X0
|
|
MOVOU -16(R8)(R12*1), X1
|
|
MOVOU X0, (SI)
|
|
MOVOU X1, -16(SI)(R12*1)
|
|
JMP decodeBlockAsm_fast_litcopy_done
|
|
|
|
decodeBlockAsm_fast_lit_0_copy_memmove_move_33through64:
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(R12*1), X2
|
|
MOVOU -16(R8)(R12*1), X3
|
|
MOVOU X0, (SI)
|
|
MOVOU X1, 16(SI)
|
|
MOVOU X2, -32(SI)(R12*1)
|
|
MOVOU X3, -16(SI)(R12*1)
|
|
JMP decodeBlockAsm_fast_litcopy_done
|
|
|
|
decodeBlockAsm_fast_lit_1:
|
|
MOVBQZX 1(R8), R12
|
|
ADDQ $0x02, R8
|
|
JMP decodeBlockAsm_fast_litcopy_long
|
|
|
|
decodeBlockAsm_fast_lit_2:
|
|
MOVWQZX 1(R8), R12
|
|
ADDQ $0x03, R8
|
|
JMP decodeBlockAsm_fast_litcopy_long
|
|
|
|
decodeBlockAsm_fast_lit_3:
|
|
MOVL (R8), R12
|
|
ADDQ $0x04, R8
|
|
SHRL $0x08, R12
|
|
|
|
decodeBlockAsm_fast_litcopy_long:
|
|
LEAQ 30(R12), R12
|
|
LEAQ (SI)(R12*1), R10
|
|
CMPQ R10, AX
|
|
JA corrupt
|
|
BTL $0x00, R11
|
|
JC decodeBlockAsm_fast_copy_exec
|
|
LEAQ (R8)(R12*1), R10
|
|
CMPQ R10, CX
|
|
JA corrupt
|
|
CMPL R12, $0x40
|
|
JBE decodeBlockAsm_fast_litcopy_short_reduced
|
|
|
|
// genMemMoveLong
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(R12*1), X2
|
|
MOVOU -16(R8)(R12*1), X3
|
|
MOVQ R12, R11
|
|
SHRQ $0x05, R11
|
|
MOVQ SI, R10
|
|
ANDL $0x0000001f, R10
|
|
MOVQ $0x00000040, R13
|
|
SUBQ R10, R13
|
|
DECQ R11
|
|
JA decodeBlockAsm_fast_litcopy_longlarge_forward_sse_loop_32
|
|
LEAQ -32(R8)(R13*1), R10
|
|
LEAQ -32(SI)(R13*1), R14
|
|
|
|
decodeBlockAsm_fast_litcopy_longlarge_big_loop_back:
|
|
MOVOU (R10), X4
|
|
MOVOU 16(R10), X5
|
|
MOVOA X4, (R14)
|
|
MOVOA X5, 16(R14)
|
|
ADDQ $0x20, R14
|
|
ADDQ $0x20, R10
|
|
ADDQ $0x20, R13
|
|
DECQ R11
|
|
JNA decodeBlockAsm_fast_litcopy_longlarge_big_loop_back
|
|
|
|
decodeBlockAsm_fast_litcopy_longlarge_forward_sse_loop_32:
|
|
MOVOU -32(R8)(R13*1), X4
|
|
MOVOU -16(R8)(R13*1), X5
|
|
MOVOA X4, -32(SI)(R13*1)
|
|
MOVOA X5, -16(SI)(R13*1)
|
|
ADDQ $0x20, R13
|
|
CMPQ R12, R13
|
|
JAE decodeBlockAsm_fast_litcopy_longlarge_forward_sse_loop_32
|
|
MOVOU X0, (SI)
|
|
MOVOU X1, 16(SI)
|
|
MOVOU X2, -32(SI)(R12*1)
|
|
MOVOU X3, -16(SI)(R12*1)
|
|
JMP decodeBlockAsm_fast_litcopy_done
|
|
|
|
decodeBlockAsm_fast_litcopy_short_reduced:
|
|
// genMemMoveShort
|
|
// margin: 16, min move: 30
|
|
CMPQ R12, $0x20
|
|
JBE decodeBlockAsm_fast_lit_longer_copy_memmove_move_17through32
|
|
JMP decodeBlockAsm_fast_lit_longer_copy_memmove_move_33through64
|
|
|
|
decodeBlockAsm_fast_lit_longer_copy_memmove_move_17through32:
|
|
MOVOU (R8), X0
|
|
MOVOU -16(R8)(R12*1), X1
|
|
MOVOU X0, (SI)
|
|
MOVOU X1, -16(SI)(R12*1)
|
|
JMP decodeBlockAsm_fast_litcopy_done
|
|
|
|
decodeBlockAsm_fast_lit_longer_copy_memmove_move_33through64:
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(R12*1), X2
|
|
MOVOU -16(R8)(R12*1), X3
|
|
MOVOU X0, (SI)
|
|
MOVOU X1, 16(SI)
|
|
MOVOU X2, -32(SI)(R12*1)
|
|
MOVOU X3, -16(SI)(R12*1)
|
|
|
|
decodeBlockAsm_fast_litcopy_done:
|
|
ADDQ R12, R8
|
|
ADDQ R12, SI
|
|
ADDQ R12, DI
|
|
CMPQ R8, DX
|
|
JAE decodeBlockAsm_fast_end_done
|
|
MOVBQZX (R8), R10
|
|
MOVQ R10, R11
|
|
SHRQ $0x02, R11
|
|
CMPQ SI, BX
|
|
JAE decodeBlockAsm_fast_end_done
|
|
ANDQ $0x03, R10
|
|
JZ decodeBlockAsm_fast_lits
|
|
|
|
decodeBlockAsm_fast_copy:
|
|
MOVL (R8), R13
|
|
CMPL R10, $0x02
|
|
JB decodeBlockAsm_fast_copy_1
|
|
JEQ decodeBlockAsm_fast_copy_2
|
|
JMP decodeBlockAsm_fast_copy_3
|
|
|
|
decodeBlockAsm_fast_copy_1:
|
|
MOVWQZX R13, R9
|
|
ADDQ $0x02, R8
|
|
MOVQ R11, R12
|
|
ANDL $0x0f, R12
|
|
SHRL $0x06, R9
|
|
INCL R9
|
|
SHRL $0x10, R13
|
|
LEAQ 1(R8), R10
|
|
MOVBLZX R13, R11
|
|
ADDL $0x04, R12
|
|
LEAL 18(R11), R11
|
|
CMPL R12, $0x13
|
|
CMOVLEQ R11, R12
|
|
CMOVQEQ R10, R8
|
|
JMP decodeBlockAsm_fast_copy_exec
|
|
|
|
decodeBlockAsm_fast_copy_2:
|
|
MOVQ R11, R12
|
|
CMPL R11, $0x3d
|
|
JB decodeBlockAsm_fast_copy_2_0_extra
|
|
JEQ decodeBlockAsm_fast_copy_2_1_extra
|
|
CMPL R12, $0x3f
|
|
JB decodeBlockAsm_fast_copy_2_2_extra
|
|
MOVWQZX 1(R8), R9
|
|
MOVL 2(R8), R12
|
|
ADDQ $0x06, R8
|
|
SHRL $0x08, R12
|
|
LEAL 64(R12), R12
|
|
ADDQ $0x40, R9
|
|
JMP decodeBlockAsm_fast_copy_exec_long_long
|
|
|
|
decodeBlockAsm_fast_copy_2_2_extra:
|
|
MOVWQZX 1(R8), R9
|
|
MOVWLZX 3(R8), R12
|
|
ADDQ $0x05, R8
|
|
LEAL 64(R12), R12
|
|
ADDQ $0x40, R9
|
|
JMP decodeBlockAsm_fast_copy_exec_long_long
|
|
|
|
decodeBlockAsm_fast_copy_2_1_extra:
|
|
MOVL R13, R12
|
|
SHRL $0x08, R13
|
|
SHRL $0x18, R12
|
|
MOVWQZX R13, R9
|
|
ADDQ $0x04, R8
|
|
LEAL 64(R12), R12
|
|
ADDQ $0x40, R9
|
|
JMP decodeBlockAsm_fast_copy_exec_long_long
|
|
|
|
decodeBlockAsm_fast_copy_2_0_extra:
|
|
SHRL $0x08, R13
|
|
MOVWQZX R13, R9
|
|
LEAQ 3(R8), R8
|
|
LEAL 4(R12), R12
|
|
ADDQ $0x40, R9
|
|
JMP decodeBlockAsm_fast_copy_short_no_ol
|
|
|
|
decodeBlockAsm_fast_copy_3:
|
|
MOVL R13, R9
|
|
ADDQ $0x04, R8
|
|
MOVQ R11, R10
|
|
SHRQ $0x01, R10
|
|
ANDQ $0x03, R10
|
|
BTL $0x00, R11
|
|
JC decodeBlockAsm_fast_copy3_read
|
|
SHRL $0x03, R11
|
|
ANDL $0x07, R11
|
|
LEAL 4(R11), R12
|
|
SHRL $0x08, R13
|
|
MOVWQZX R13, R9
|
|
DECQ R8
|
|
INCQ R10
|
|
MOVL (R8), R11
|
|
MOVL R11, (SI)
|
|
ADDQ $0x40, R9
|
|
ADDQ R10, R8
|
|
ADDQ R10, SI
|
|
ADDQ R10, DI
|
|
JMP decodeBlockAsm_fast_copy_short_no_ol
|
|
|
|
decodeBlockAsm_fast_copy3_read:
|
|
MOVL R9, R12
|
|
SHRL $0x05, R12
|
|
ANDL $0x3f, R12
|
|
SHRL $0x0b, R9
|
|
ADDL $0x00010000, R9
|
|
CMPL R12, $0x3d
|
|
JB decodeBlockAsm_fast_copy_3_0_extra
|
|
JEQ decodeBlockAsm_fast_copy_3_1_extra
|
|
CMPL R12, $0x3e
|
|
JEQ decodeBlockAsm_fast_copy_3_2_extra
|
|
MOVL -1(R8), R12
|
|
ADDQ $0x03, R8
|
|
SHRL $0x08, R12
|
|
LEAL 64(R12), R12
|
|
JMP decodeBlockAsm_fast_copy_fused_long
|
|
|
|
decodeBlockAsm_fast_copy_3_2_extra:
|
|
MOVWLZX (R8), R12
|
|
ADDQ $0x02, R8
|
|
LEAL 64(R12), R12
|
|
JMP decodeBlockAsm_fast_copy_fused_long
|
|
|
|
decodeBlockAsm_fast_copy_3_1_extra:
|
|
MOVBLZX (R8), R12
|
|
ADDQ $0x01, R8
|
|
LEAL 64(R12), R12
|
|
JMP decodeBlockAsm_fast_copy_fused_long
|
|
|
|
decodeBlockAsm_fast_copy_3_0_extra:
|
|
LEAL 4(R12), R12
|
|
MOVL (R8), R11
|
|
MOVL R11, (SI)
|
|
ADDQ R10, R8
|
|
ADDQ R10, SI
|
|
ADDQ R10, DI
|
|
JMP decodeBlockAsm_fast_copy_short_no_ol
|
|
|
|
decodeBlockAsm_fast_copy_fused_long:
|
|
MOVL (R8), R11
|
|
MOVL R11, (SI)
|
|
ADDQ R10, R8
|
|
ADDQ R10, SI
|
|
ADDQ R10, DI
|
|
JMP decodeBlockAsm_fast_copy_exec_long_long
|
|
|
|
decodeBlockAsm_fast_copy_exec_short:
|
|
CMPL R9, DI
|
|
JA corrupt
|
|
LEAQ (SI)(R12*1), R10
|
|
CMPQ R10, AX
|
|
JA corrupt
|
|
|
|
// Prefetch next tag
|
|
MOVBQZX (R8), R10
|
|
MOVQ SI, R11
|
|
SUBQ R9, R11
|
|
CMPL R9, R12
|
|
JB decodeBlockAsm_fast_copy_overlap
|
|
JMP decodeBlockAsm_fast_copy_short
|
|
|
|
decodeBlockAsm_fast_copy_exec_long_long:
|
|
MOVQ SI, R11
|
|
SUBQ R9, R11
|
|
CMPL R9, DI
|
|
JA corrupt
|
|
LEAQ (SI)(R12*1), R10
|
|
CMPQ R10, AX
|
|
JA corrupt
|
|
|
|
// Prefetch next tag
|
|
MOVBQZX (R8), R10
|
|
|
|
// genMemMoveLong
|
|
MOVQ R12, R13
|
|
SHRQ $0x05, R13
|
|
MOVQ SI, R14
|
|
MOVQ R12, R15
|
|
|
|
decodeBlockAsm_fast_copy_long_longlarge_big_loop_back:
|
|
MOVOU (R11), X0
|
|
MOVOU 16(R11), X1
|
|
MOVOU X0, (R14)
|
|
MOVOU X1, 16(R14)
|
|
ADDQ $0x20, R14
|
|
ADDQ $0x20, R11
|
|
SUBQ $0x20, R15
|
|
DECQ R13
|
|
JNZ decodeBlockAsm_fast_copy_long_longlarge_big_loop_back
|
|
TESTQ R15, R15
|
|
JZ decodeBlockAsm_fast_copy_done
|
|
MOVOU -32(R11)(R15*1), X0
|
|
MOVOU -16(R11)(R15*1), X1
|
|
MOVOU X0, -32(R14)(R15*1)
|
|
MOVOU X1, -16(R14)(R15*1)
|
|
JMP decodeBlockAsm_fast_copy_done
|
|
|
|
decodeBlockAsm_fast_copy_short_no_ol:
|
|
MOVQ SI, R11
|
|
SUBQ R9, R11
|
|
CMPL R9, DI
|
|
JA corrupt
|
|
LEAQ (SI)(R12*1), R10
|
|
CMPQ R10, AX
|
|
JA corrupt
|
|
|
|
// Prefetch next tag
|
|
MOVBQZX (R8), R10
|
|
|
|
// genMemMoveShort
|
|
// margin: 16, min move: 4
|
|
CMPQ R12, $0x10
|
|
JBE decodeBlockAsm_fast_copy_short_no_ol_memmove_move_8through16
|
|
CMPQ R12, $0x20
|
|
JBE decodeBlockAsm_fast_copy_short_no_ol_memmove_move_17through32
|
|
JMP decodeBlockAsm_fast_copy_short_no_ol_memmove_move_33through64
|
|
|
|
decodeBlockAsm_fast_copy_short_no_ol_memmove_move_8through16:
|
|
MOVOU (R11), X0
|
|
MOVOU X0, (SI)
|
|
JMP decodeBlockAsm_fast_copy_done
|
|
|
|
decodeBlockAsm_fast_copy_short_no_ol_memmove_move_17through32:
|
|
MOVOU (R11), X0
|
|
MOVOU -16(R11)(R12*1), X1
|
|
MOVOU X0, (SI)
|
|
MOVOU X1, -16(SI)(R12*1)
|
|
JMP decodeBlockAsm_fast_copy_done
|
|
|
|
decodeBlockAsm_fast_copy_short_no_ol_memmove_move_33through64:
|
|
MOVOU (R11), X0
|
|
MOVOU 16(R11), X1
|
|
MOVOU -32(R11)(R12*1), X2
|
|
MOVOU -16(R11)(R12*1), X3
|
|
MOVOU X0, (SI)
|
|
MOVOU X1, 16(SI)
|
|
MOVOU X2, -32(SI)(R12*1)
|
|
MOVOU X3, -16(SI)(R12*1)
|
|
JMP decodeBlockAsm_fast_copy_done
|
|
|
|
decodeBlockAsm_fast_copy_exec:
|
|
CMPL R9, DI
|
|
JA corrupt
|
|
LEAQ (SI)(R12*1), R10
|
|
CMPQ R10, AX
|
|
JA corrupt
|
|
MOVQ SI, R11
|
|
SUBQ R9, R11
|
|
|
|
// Prefetch next tag
|
|
MOVBQZX (R8), R10
|
|
CMPL R9, R12
|
|
JB decodeBlockAsm_fast_copy_overlap
|
|
CMPL R12, $0x40
|
|
JA decodeBlockAsm_fast_copy_long
|
|
|
|
decodeBlockAsm_fast_copy_short:
|
|
// genMemMoveShort
|
|
// margin: 16, min move: 1
|
|
CMPQ R12, $0x10
|
|
JBE decodeBlockAsm_fast_copy_short_memmove_move_8through16
|
|
CMPQ R12, $0x20
|
|
JBE decodeBlockAsm_fast_copy_short_memmove_move_17through32
|
|
JMP decodeBlockAsm_fast_copy_short_memmove_move_33through64
|
|
|
|
decodeBlockAsm_fast_copy_short_memmove_move_8through16:
|
|
MOVOU (R11), X0
|
|
MOVOU X0, (SI)
|
|
JMP decodeBlockAsm_fast_copy_done
|
|
|
|
decodeBlockAsm_fast_copy_short_memmove_move_17through32:
|
|
MOVOU (R11), X0
|
|
MOVOU -16(R11)(R12*1), X1
|
|
MOVOU X0, (SI)
|
|
MOVOU X1, -16(SI)(R12*1)
|
|
JMP decodeBlockAsm_fast_copy_done
|
|
|
|
decodeBlockAsm_fast_copy_short_memmove_move_33through64:
|
|
MOVOU (R11), X0
|
|
MOVOU 16(R11), X1
|
|
MOVOU -32(R11)(R12*1), X2
|
|
MOVOU -16(R11)(R12*1), X3
|
|
MOVOU X0, (SI)
|
|
MOVOU X1, 16(SI)
|
|
MOVOU X2, -32(SI)(R12*1)
|
|
MOVOU X3, -16(SI)(R12*1)
|
|
JMP decodeBlockAsm_fast_copy_done
|
|
|
|
decodeBlockAsm_fast_copy_long:
|
|
// genMemMoveLong
|
|
MOVOU (R11), X0
|
|
MOVOU 16(R11), X1
|
|
MOVOU -32(R11)(R12*1), X2
|
|
MOVOU -16(R11)(R12*1), X3
|
|
MOVQ R12, R14
|
|
SHRQ $0x05, R14
|
|
MOVQ SI, R13
|
|
ANDL $0x0000001f, R13
|
|
MOVQ $0x00000040, R15
|
|
SUBQ R13, R15
|
|
DECQ R14
|
|
JA decodeBlockAsm_fast_copy_longlarge_forward_sse_loop_32
|
|
LEAQ -32(R11)(R15*1), R13
|
|
LEAQ -32(SI)(R15*1), BP
|
|
|
|
decodeBlockAsm_fast_copy_longlarge_big_loop_back:
|
|
MOVOU (R13), X4
|
|
MOVOU 16(R13), X5
|
|
MOVOA X4, (BP)
|
|
MOVOA X5, 16(BP)
|
|
ADDQ $0x20, BP
|
|
ADDQ $0x20, R13
|
|
ADDQ $0x20, R15
|
|
DECQ R14
|
|
JNA decodeBlockAsm_fast_copy_longlarge_big_loop_back
|
|
|
|
decodeBlockAsm_fast_copy_longlarge_forward_sse_loop_32:
|
|
MOVOU -32(R11)(R15*1), X4
|
|
MOVOU -16(R11)(R15*1), X5
|
|
MOVOA X4, -32(SI)(R15*1)
|
|
MOVOA X5, -16(SI)(R15*1)
|
|
ADDQ $0x20, R15
|
|
CMPQ R12, R15
|
|
JAE decodeBlockAsm_fast_copy_longlarge_forward_sse_loop_32
|
|
MOVOU X0, (SI)
|
|
MOVOU X1, 16(SI)
|
|
MOVOU X2, -32(SI)(R12*1)
|
|
MOVOU X3, -16(SI)(R12*1)
|
|
|
|
decodeBlockAsm_fast_copy_done:
|
|
ADDQ R12, SI
|
|
ADDQ R12, DI
|
|
MOVQ R10, R11
|
|
SHRQ $0x02, R11
|
|
CMPQ R8, DX
|
|
JB decodeBlockAsm_fast_loop_nofetch
|
|
JMP decodeBlockAsm_fast_end_copy
|
|
|
|
decodeBlockAsm_fast_copy_overlap:
|
|
CMPL R9, $0x03
|
|
JA decodeBlockAsm_fast_copy_overlap_4
|
|
JE decodeBlockAsm_fast_copy_overlap_3
|
|
CMPL R9, $0x02
|
|
JE decodeBlockAsm_fast_copy_overlap_2
|
|
MOVB (R11), R11
|
|
ADDQ R12, DI
|
|
|
|
decodeBlockAsm_fast_loop_overlap_1:
|
|
MOVB R11, (SI)
|
|
INCQ SI
|
|
DECQ R12
|
|
JNZ decodeBlockAsm_fast_loop_overlap_1
|
|
MOVQ R10, R11
|
|
SHRQ $0x02, R11
|
|
CMPQ R8, DX
|
|
JB decodeBlockAsm_fast_loop_nofetch
|
|
JMP decodeBlockAsm_fast_end_copy
|
|
|
|
decodeBlockAsm_fast_copy_overlap_2:
|
|
MOVW (R11), R13
|
|
ADDQ R12, DI
|
|
BTL $0x00, R12
|
|
JNC decodeBlockAsm_fast_loop_overlap_2
|
|
MOVB R13, (SI)
|
|
MOVW 1(R11), R13
|
|
INCQ SI
|
|
DECQ R12
|
|
|
|
decodeBlockAsm_fast_loop_overlap_2:
|
|
MOVW R13, (SI)
|
|
ADDQ $0x02, SI
|
|
SUBQ $0x02, R12
|
|
JNZ decodeBlockAsm_fast_loop_overlap_2
|
|
MOVQ R10, R11
|
|
SHRQ $0x02, R11
|
|
CMPQ R8, DX
|
|
JB decodeBlockAsm_fast_loop_nofetch
|
|
JMP decodeBlockAsm_fast_end_copy
|
|
|
|
decodeBlockAsm_fast_copy_overlap_3:
|
|
MOVL (R11), R13
|
|
ADDQ R12, DI
|
|
SUBQ $0x03, R12
|
|
|
|
decodeBlockAsm_fast_loop_overlap_3:
|
|
MOVL R13, (SI)
|
|
ADDQ $0x03, SI
|
|
SUBQ $0x03, R12
|
|
JA decodeBlockAsm_fast_loop_overlap_3
|
|
MOVW 3(R11)(R12*1), R13
|
|
MOVW R13, (SI)(R12*1)
|
|
MOVB 5(R11)(R12*1), R13
|
|
MOVB R13, 2(SI)(R12*1)
|
|
LEAQ 3(SI)(R12*1), SI
|
|
MOVQ R10, R11
|
|
SHRQ $0x02, R11
|
|
CMPQ R8, DX
|
|
JB decodeBlockAsm_fast_loop_nofetch
|
|
JMP decodeBlockAsm_fast_end_copy
|
|
|
|
decodeBlockAsm_fast_copy_overlap_4:
|
|
ADDQ R12, DI
|
|
SUBQ $0x04, R12
|
|
|
|
decodeBlockAsm_fast_loop_overlap_4:
|
|
MOVL (R11), R13
|
|
ADDQ $0x04, R11
|
|
MOVL R13, (SI)
|
|
ADDQ $0x04, SI
|
|
SUBQ $0x04, R12
|
|
JA decodeBlockAsm_fast_loop_overlap_4
|
|
MOVL (R11)(R12*1), R13
|
|
MOVL R13, (SI)(R12*1)
|
|
LEAQ 4(SI)(R12*1), SI
|
|
MOVQ R10, R11
|
|
SHRQ $0x02, R11
|
|
CMPQ R8, DX
|
|
JB decodeBlockAsm_fast_loop_nofetch
|
|
|
|
decodeBlockAsm_fast_end_copy:
|
|
decodeBlockAsm_fast_end_done:
|
|
decodeBlockAsm_remain_loop:
|
|
CMPQ R8, CX
|
|
JAE decodeBlockAsm_remain_end_copy
|
|
MOVBQZX (R8), DX
|
|
MOVQ DX, BX
|
|
SHRQ $0x02, BX
|
|
CMPQ SI, AX
|
|
JAE decodeBlockAsm_remain_end_copy
|
|
ANDQ $0x03, DX
|
|
JNZ decodeBlockAsm_remain_copy
|
|
|
|
decodeBlockAsm_remain_lits:
|
|
MOVL BX, DX
|
|
SHRL $0x01, DX
|
|
CMPL DX, $0x1d
|
|
JB decodeBlockAsm_remain_lit_0
|
|
JEQ decodeBlockAsm_remain_lit_1
|
|
CMPL DX, $0x1e
|
|
JEQ decodeBlockAsm_remain_lit_2
|
|
JMP decodeBlockAsm_remain_lit_3
|
|
|
|
decodeBlockAsm_remain_lit_0:
|
|
INCQ R8
|
|
INCL DX
|
|
LEAQ (SI)(DX*1), R10
|
|
CMPQ R10, AX
|
|
JA corrupt
|
|
BTL $0x00, BX
|
|
JC decodeBlockAsm_remain_copy_exec_short
|
|
LEAQ (R8)(DX*1), BX
|
|
CMPQ BX, CX
|
|
JA corrupt
|
|
|
|
// genMemMoveShort
|
|
// margin: -1, min move: 1
|
|
CMPQ DX, $0x03
|
|
JB decodeBlockAsm_remain_lit_0_copy_memmove_move_1or2
|
|
JE decodeBlockAsm_remain_lit_0_copy_memmove_move_3
|
|
CMPQ DX, $0x08
|
|
JBE decodeBlockAsm_remain_lit_0_copy_memmove_move_4through8
|
|
CMPQ DX, $0x10
|
|
JBE decodeBlockAsm_remain_lit_0_copy_memmove_move_8through16
|
|
CMPQ DX, $0x20
|
|
JBE decodeBlockAsm_remain_lit_0_copy_memmove_move_17through32
|
|
JMP decodeBlockAsm_remain_lit_0_copy_memmove_move_33through64
|
|
|
|
decodeBlockAsm_remain_lit_0_copy_memmove_move_1or2:
|
|
MOVB (R8), BL
|
|
MOVB -1(R8)(DX*1), R10
|
|
MOVB BL, (SI)
|
|
MOVB R10, -1(SI)(DX*1)
|
|
JMP decodeBlockAsm_remain_litcopy_done
|
|
|
|
decodeBlockAsm_remain_lit_0_copy_memmove_move_3:
|
|
MOVW (R8), BX
|
|
MOVB 2(R8), R10
|
|
MOVW BX, (SI)
|
|
MOVB R10, 2(SI)
|
|
JMP decodeBlockAsm_remain_litcopy_done
|
|
|
|
decodeBlockAsm_remain_lit_0_copy_memmove_move_4through8:
|
|
MOVL (R8), BX
|
|
MOVL -4(R8)(DX*1), R10
|
|
MOVL BX, (SI)
|
|
MOVL R10, -4(SI)(DX*1)
|
|
JMP decodeBlockAsm_remain_litcopy_done
|
|
|
|
decodeBlockAsm_remain_lit_0_copy_memmove_move_8through16:
|
|
MOVQ (R8), BX
|
|
MOVQ -8(R8)(DX*1), R10
|
|
MOVQ BX, (SI)
|
|
MOVQ R10, -8(SI)(DX*1)
|
|
JMP decodeBlockAsm_remain_litcopy_done
|
|
|
|
decodeBlockAsm_remain_lit_0_copy_memmove_move_17through32:
|
|
MOVOU (R8), X0
|
|
MOVOU -16(R8)(DX*1), X1
|
|
MOVOU X0, (SI)
|
|
MOVOU X1, -16(SI)(DX*1)
|
|
JMP decodeBlockAsm_remain_litcopy_done
|
|
|
|
decodeBlockAsm_remain_lit_0_copy_memmove_move_33through64:
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(DX*1), X2
|
|
MOVOU -16(R8)(DX*1), X3
|
|
MOVOU X0, (SI)
|
|
MOVOU X1, 16(SI)
|
|
MOVOU X2, -32(SI)(DX*1)
|
|
MOVOU X3, -16(SI)(DX*1)
|
|
JMP decodeBlockAsm_remain_litcopy_done
|
|
|
|
decodeBlockAsm_remain_lit_1:
|
|
ADDQ $0x02, R8
|
|
CMPQ R8, CX
|
|
JA corrupt
|
|
MOVBQZX -1(R8), DX
|
|
JMP decodeBlockAsm_remain_litcopy_long
|
|
|
|
decodeBlockAsm_remain_lit_2:
|
|
ADDQ $0x03, R8
|
|
CMPQ R8, CX
|
|
JA corrupt
|
|
MOVWQZX -2(R8), DX
|
|
JMP decodeBlockAsm_remain_litcopy_long
|
|
|
|
decodeBlockAsm_remain_lit_3:
|
|
ADDQ $0x04, R8
|
|
CMPQ R8, CX
|
|
JA corrupt
|
|
MOVL -4(R8), DX
|
|
SHRL $0x08, DX
|
|
|
|
decodeBlockAsm_remain_litcopy_long:
|
|
LEAQ 30(DX), DX
|
|
LEAQ (SI)(DX*1), R10
|
|
CMPQ R10, AX
|
|
JA corrupt
|
|
BTL $0x00, BX
|
|
JC decodeBlockAsm_remain_copy_exec
|
|
LEAQ (R8)(DX*1), BX
|
|
CMPQ BX, CX
|
|
JA corrupt
|
|
CMPL DX, $0x40
|
|
JBE decodeBlockAsm_remain_litcopy_short_reduced
|
|
|
|
// genMemMoveLong
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(DX*1), X2
|
|
MOVOU -16(R8)(DX*1), X3
|
|
MOVQ DX, R10
|
|
SHRQ $0x05, R10
|
|
MOVQ SI, BX
|
|
ANDL $0x0000001f, BX
|
|
MOVQ $0x00000040, R11
|
|
SUBQ BX, R11
|
|
DECQ R10
|
|
JA decodeBlockAsm_remain_litcopy_longlarge_forward_sse_loop_32
|
|
LEAQ -32(R8)(R11*1), BX
|
|
LEAQ -32(SI)(R11*1), R12
|
|
|
|
decodeBlockAsm_remain_litcopy_longlarge_big_loop_back:
|
|
MOVOU (BX), X4
|
|
MOVOU 16(BX), X5
|
|
MOVOA X4, (R12)
|
|
MOVOA X5, 16(R12)
|
|
ADDQ $0x20, R12
|
|
ADDQ $0x20, BX
|
|
ADDQ $0x20, R11
|
|
DECQ R10
|
|
JNA decodeBlockAsm_remain_litcopy_longlarge_big_loop_back
|
|
|
|
decodeBlockAsm_remain_litcopy_longlarge_forward_sse_loop_32:
|
|
MOVOU -32(R8)(R11*1), X4
|
|
MOVOU -16(R8)(R11*1), X5
|
|
MOVOA X4, -32(SI)(R11*1)
|
|
MOVOA X5, -16(SI)(R11*1)
|
|
ADDQ $0x20, R11
|
|
CMPQ DX, R11
|
|
JAE decodeBlockAsm_remain_litcopy_longlarge_forward_sse_loop_32
|
|
MOVOU X0, (SI)
|
|
MOVOU X1, 16(SI)
|
|
MOVOU X2, -32(SI)(DX*1)
|
|
MOVOU X3, -16(SI)(DX*1)
|
|
JMP decodeBlockAsm_remain_litcopy_done
|
|
|
|
decodeBlockAsm_remain_litcopy_short_reduced:
|
|
// genMemMoveShort
|
|
// margin: -4, min move: 30
|
|
CMPQ DX, $0x20
|
|
JBE decodeBlockAsm_remain_lit_longer_copy_memmove_move_17through32
|
|
JMP decodeBlockAsm_remain_lit_longer_copy_memmove_move_33through64
|
|
|
|
decodeBlockAsm_remain_lit_longer_copy_memmove_move_17through32:
|
|
MOVOU (R8), X0
|
|
MOVOU -16(R8)(DX*1), X1
|
|
MOVOU X0, (SI)
|
|
MOVOU X1, -16(SI)(DX*1)
|
|
JMP decodeBlockAsm_remain_litcopy_done
|
|
|
|
decodeBlockAsm_remain_lit_longer_copy_memmove_move_33through64:
|
|
MOVOU (R8), X0
|
|
MOVOU 16(R8), X1
|
|
MOVOU -32(R8)(DX*1), X2
|
|
MOVOU -16(R8)(DX*1), X3
|
|
MOVOU X0, (SI)
|
|
MOVOU X1, 16(SI)
|
|
MOVOU X2, -32(SI)(DX*1)
|
|
MOVOU X3, -16(SI)(DX*1)
|
|
|
|
decodeBlockAsm_remain_litcopy_done:
|
|
ADDQ DX, R8
|
|
ADDQ DX, SI
|
|
ADDQ DX, DI
|
|
CMPQ R8, CX
|
|
JAE decodeBlockAsm_remain_end_done
|
|
MOVBQZX (R8), DX
|
|
MOVQ DX, BX
|
|
SHRQ $0x02, BX
|
|
CMPQ SI, AX
|
|
JAE decodeBlockAsm_remain_end_done
|
|
ANDQ $0x03, DX
|
|
JZ decodeBlockAsm_remain_lits
|
|
|
|
decodeBlockAsm_remain_copy:
|
|
CMPL DX, $0x02
|
|
JB decodeBlockAsm_remain_copy_1
|
|
JEQ decodeBlockAsm_remain_copy_2
|
|
JMP decodeBlockAsm_remain_copy_3
|
|
|
|
decodeBlockAsm_remain_copy_1:
|
|
ADDQ $0x02, R8
|
|
CMPQ R8, CX
|
|
JA corrupt
|
|
MOVWQZX -2(R8), R9
|
|
MOVQ BX, DX
|
|
ANDL $0x0f, DX
|
|
SHRL $0x06, R9
|
|
INCL R9
|
|
CMPL DX, $0x0f
|
|
JNE decodeBlockAsm_remain_copy_1_short
|
|
ADDQ $0x01, R8
|
|
CMPQ R8, CX
|
|
JA corrupt
|
|
MOVBLZX -1(R8), DX
|
|
LEAL 18(DX), DX
|
|
JMP decodeBlockAsm_remain_copy_exec
|
|
|
|
decodeBlockAsm_remain_copy_1_short:
|
|
LEAL 4(DX), DX
|
|
JMP decodeBlockAsm_remain_copy_exec_short
|
|
|
|
decodeBlockAsm_remain_copy_2:
|
|
MOVQ BX, DX
|
|
CMPL BX, $0x3d
|
|
JB decodeBlockAsm_remain_copy_2_0_extra
|
|
JEQ decodeBlockAsm_remain_copy_2_1_extra
|
|
CMPL DX, $0x3f
|
|
JB decodeBlockAsm_remain_copy_2_2_extra
|
|
ADDQ $0x06, R8
|
|
CMPQ R8, CX
|
|
JA corrupt
|
|
MOVWQZX -5(R8), R9
|
|
MOVL -4(R8), DX
|
|
SHRL $0x08, DX
|
|
LEAL 64(DX), DX
|
|
ADDQ $0x40, R9
|
|
JMP decodeBlockAsm_remain_copy_exec_long_long
|
|
|
|
decodeBlockAsm_remain_copy_2_2_extra:
|
|
ADDQ $0x05, R8
|
|
CMPQ R8, CX
|
|
JA corrupt
|
|
MOVWQZX -4(R8), R9
|
|
MOVWLZX -2(R8), DX
|
|
LEAL 64(DX), DX
|
|
ADDQ $0x40, R9
|
|
JMP decodeBlockAsm_remain_copy_exec_long_long
|
|
|
|
decodeBlockAsm_remain_copy_2_1_extra:
|
|
ADDQ $0x04, R8
|
|
CMPQ R8, CX
|
|
JA corrupt
|
|
MOVWQZX -3(R8), R9
|
|
MOVBLZX -1(R8), DX
|
|
LEAL 64(DX), DX
|
|
ADDQ $0x40, R9
|
|
JMP decodeBlockAsm_remain_copy_exec_long_long
|
|
|
|
decodeBlockAsm_remain_copy_2_0_extra:
|
|
LEAQ 3(R8), R8
|
|
CMPQ R8, CX
|
|
JA corrupt
|
|
MOVWQZX -2(R8), R9
|
|
LEAL 4(DX), DX
|
|
ADDQ $0x40, R9
|
|
JMP decodeBlockAsm_remain_copy_short_no_ol
|
|
|
|
decodeBlockAsm_remain_copy_3:
|
|
ADDQ $0x04, R8
|
|
CMPQ R8, CX
|
|
JA corrupt
|
|
MOVL -4(R8), R9
|
|
MOVQ BX, R10
|
|
SHRQ $0x01, R10
|
|
ANDQ $0x03, R10
|
|
BTL $0x00, BX
|
|
JC decodeBlockAsm_remain_copy3_read
|
|
SHRL $0x03, BX
|
|
ANDL $0x07, BX
|
|
LEAL 4(BX), DX
|
|
MOVWQZX -3(R8), R9
|
|
DECQ R8
|
|
INCQ R10
|
|
LEAQ (R8)(R10*1), BX
|
|
LEAQ (SI)(R10*1), R11
|
|
CMPQ BX, CX
|
|
JA corrupt
|
|
CMPQ R11, AX
|
|
JA corrupt
|
|
|
|
// genMemMoveVeryShort
|
|
CMPQ R10, $0x03
|
|
JE decodeBlockAsm_remain_copy2_fused_lits_move_3
|
|
JA decodeBlockAsm_remain_copy2_fused_lits_move_4
|
|
MOVB (R8), BL
|
|
MOVB -1(R8)(R10*1), R11
|
|
MOVB BL, (SI)
|
|
MOVB R11, -1(SI)(R10*1)
|
|
JMP decodeBlockAsm_remain_copy2_fused_lits_done
|
|
|
|
decodeBlockAsm_remain_copy2_fused_lits_move_3:
|
|
MOVW (R8), BX
|
|
MOVB 2(R8), R11
|
|
MOVW BX, (SI)
|
|
MOVB R11, 2(SI)
|
|
JMP decodeBlockAsm_remain_copy2_fused_lits_done
|
|
|
|
decodeBlockAsm_remain_copy2_fused_lits_move_4:
|
|
MOVL (R8), BX
|
|
MOVL BX, (SI)
|
|
|
|
decodeBlockAsm_remain_copy2_fused_lits_done:
|
|
ADDQ $0x40, R9
|
|
ADDQ R10, R8
|
|
ADDQ R10, SI
|
|
ADDQ R10, DI
|
|
JMP decodeBlockAsm_remain_copy_short_no_ol
|
|
|
|
decodeBlockAsm_remain_copy3_read:
|
|
MOVL R9, DX
|
|
SHRL $0x05, DX
|
|
ANDL $0x3f, DX
|
|
SHRL $0x0b, R9
|
|
ADDL $0x00010000, R9
|
|
CMPL DX, $0x3d
|
|
JB decodeBlockAsm_remain_copy_3_0_extra
|
|
JEQ decodeBlockAsm_remain_copy_3_1_extra
|
|
CMPL DX, $0x3e
|
|
JEQ decodeBlockAsm_remain_copy_3_2_extra
|
|
ADDQ $0x03, R8
|
|
CMPQ R8, CX
|
|
JA corrupt
|
|
MOVL -4(R8), DX
|
|
SHRL $0x08, DX
|
|
LEAL 64(DX), DX
|
|
JMP decodeBlockAsm_remain_copy_fused_long
|
|
|
|
decodeBlockAsm_remain_copy_3_2_extra:
|
|
ADDQ $0x02, R8
|
|
CMPQ R8, CX
|
|
JA corrupt
|
|
MOVWLZX -2(R8), DX
|
|
LEAL 64(DX), DX
|
|
JMP decodeBlockAsm_remain_copy_fused_long
|
|
|
|
decodeBlockAsm_remain_copy_3_1_extra:
|
|
ADDQ $0x01, R8
|
|
CMPQ R8, CX
|
|
JA corrupt
|
|
MOVBLZX -1(R8), DX
|
|
LEAL 64(DX), DX
|
|
JMP decodeBlockAsm_remain_copy_fused_long
|
|
|
|
decodeBlockAsm_remain_copy_3_0_extra:
|
|
LEAL 4(DX), DX
|
|
TESTL R10, R10
|
|
JZ decodeBlockAsm_remain_copy_short_no_ol
|
|
LEAQ (R8)(R10*1), BX
|
|
LEAQ (SI)(R10*1), R11
|
|
CMPQ BX, CX
|
|
JA corrupt
|
|
CMPQ R11, AX
|
|
JA corrupt
|
|
|
|
// genMemMoveVeryShort
|
|
CMPQ R10, $0x03
|
|
JE decodeBlockAsm_remain_copy3s_fused_lits_move_3
|
|
JA decodeBlockAsm_remain_copy3s_fused_lits_move_4
|
|
MOVB (R8), BL
|
|
MOVB -1(R8)(R10*1), R11
|
|
MOVB BL, (SI)
|
|
MOVB R11, -1(SI)(R10*1)
|
|
JMP decodeBlockAsm_remain_copy3s_fused_lits_done
|
|
|
|
decodeBlockAsm_remain_copy3s_fused_lits_move_3:
|
|
MOVW (R8), BX
|
|
MOVB 2(R8), R11
|
|
MOVW BX, (SI)
|
|
MOVB R11, 2(SI)
|
|
JMP decodeBlockAsm_remain_copy3s_fused_lits_done
|
|
|
|
decodeBlockAsm_remain_copy3s_fused_lits_move_4:
|
|
MOVL (R8), BX
|
|
MOVL BX, (SI)
|
|
|
|
decodeBlockAsm_remain_copy3s_fused_lits_done:
|
|
ADDQ R10, R8
|
|
ADDQ R10, SI
|
|
ADDQ R10, DI
|
|
JMP decodeBlockAsm_remain_copy_short_no_ol
|
|
|
|
decodeBlockAsm_remain_copy_fused_long:
|
|
TESTL R10, R10
|
|
JZ decodeBlockAsm_remain_copy_exec_long_long
|
|
LEAQ (R8)(R10*1), BX
|
|
LEAQ (SI)(R10*1), R11
|
|
CMPQ BX, CX
|
|
JA corrupt
|
|
CMPQ R11, AX
|
|
JA corrupt
|
|
|
|
// genMemMoveVeryShort
|
|
CMPQ R10, $0x03
|
|
JE decodeBlockAsm_remain_copy3_fused_lits_move_3
|
|
JA decodeBlockAsm_remain_copy3_fused_lits_move_4
|
|
MOVB (R8), BL
|
|
MOVB -1(R8)(R10*1), R11
|
|
MOVB BL, (SI)
|
|
MOVB R11, -1(SI)(R10*1)
|
|
JMP decodeBlockAsm_remain_copy3_fused_lits_done
|
|
|
|
decodeBlockAsm_remain_copy3_fused_lits_move_3:
|
|
MOVW (R8), BX
|
|
MOVB 2(R8), R11
|
|
MOVW BX, (SI)
|
|
MOVB R11, 2(SI)
|
|
JMP decodeBlockAsm_remain_copy3_fused_lits_done
|
|
|
|
decodeBlockAsm_remain_copy3_fused_lits_move_4:
|
|
MOVL (R8), BX
|
|
MOVL BX, (SI)
|
|
|
|
decodeBlockAsm_remain_copy3_fused_lits_done:
|
|
ADDQ R10, R8
|
|
ADDQ R10, SI
|
|
ADDQ R10, DI
|
|
JMP decodeBlockAsm_remain_copy_exec_long_long
|
|
|
|
decodeBlockAsm_remain_copy_exec_short:
|
|
CMPL R9, DI
|
|
JA corrupt
|
|
LEAQ (SI)(DX*1), BX
|
|
CMPQ BX, AX
|
|
JA corrupt
|
|
MOVQ SI, BX
|
|
SUBQ R9, BX
|
|
CMPL R9, DX
|
|
JB decodeBlockAsm_remain_copy_overlap
|
|
JMP decodeBlockAsm_remain_copy_short
|
|
|
|
decodeBlockAsm_remain_copy_exec_long_long:
|
|
MOVQ SI, BX
|
|
SUBQ R9, BX
|
|
CMPL R9, DI
|
|
JA corrupt
|
|
LEAQ (SI)(DX*1), R10
|
|
CMPQ R10, AX
|
|
JA corrupt
|
|
|
|
// genMemMoveLong
|
|
MOVQ DX, R10
|
|
SHRQ $0x05, R10
|
|
MOVQ SI, R11
|
|
MOVQ DX, R12
|
|
|
|
decodeBlockAsm_remain_copy_long_longlarge_big_loop_back:
|
|
MOVOU (BX), X0
|
|
MOVOU 16(BX), X1
|
|
MOVOU X0, (R11)
|
|
MOVOU X1, 16(R11)
|
|
ADDQ $0x20, R11
|
|
ADDQ $0x20, BX
|
|
SUBQ $0x20, R12
|
|
DECQ R10
|
|
JNZ decodeBlockAsm_remain_copy_long_longlarge_big_loop_back
|
|
TESTQ R12, R12
|
|
JZ decodeBlockAsm_remain_copy_done
|
|
MOVOU -32(BX)(R12*1), X0
|
|
MOVOU -16(BX)(R12*1), X1
|
|
MOVOU X0, -32(R11)(R12*1)
|
|
MOVOU X1, -16(R11)(R12*1)
|
|
JMP decodeBlockAsm_remain_copy_done
|
|
|
|
// Validates and performs a short copy of DX bytes from SI-R9 to SI without an
// overlap check. SI and DX are left unchanged; every path ends at
// decodeBlockAsm_remain_copy_done.
// NOTE(review): the smallest size class moves 4..8 bytes and the largest
// moves at most 64, so DX is presumably within 4..64 here — confirm.
decodeBlockAsm_remain_copy_short_no_ol:
|
|
// BX = SI - R9: address the copy reads from.
MOVQ SI, BX
|
|
SUBQ R9, BX
|
|
// Reject when R9 > DI (unsigned, 32-bit compare).
CMPL R9, DI
|
|
JA corrupt
|
|
// Reject when SI+DX > AX.
LEAQ (SI)(DX*1), R10
|
|
CMPQ R10, AX
|
|
JA corrupt
|
|
|
|
// genMemMoveShort
|
|
// margin: -4, min move: 4
|
|
// Pick a size class by DX: <=8, <=16, <=32, otherwise the 33..64 routine.
CMPQ DX, $0x08
|
|
JBE decodeBlockAsm_remain_copy_short_no_ol_memmove_move_4through8
|
|
CMPQ DX, $0x10
|
|
JBE decodeBlockAsm_remain_copy_short_no_ol_memmove_move_8through16
|
|
CMPQ DX, $0x20
|
|
JBE decodeBlockAsm_remain_copy_short_no_ol_memmove_move_17through32
|
|
JMP decodeBlockAsm_remain_copy_short_no_ol_memmove_move_33through64
|
|
|
|
// Each size class loads the head and the tail of the range (the two may
// overlap) before storing either, so every length in the class is covered.
decodeBlockAsm_remain_copy_short_no_ol_memmove_move_4through8:
|
|
MOVL (BX), R10
|
|
// BX is overwritten with the tail data; the source pointer is no longer needed.
MOVL -4(BX)(DX*1), BX
|
|
MOVL R10, (SI)
|
|
MOVL BX, -4(SI)(DX*1)
|
|
JMP decodeBlockAsm_remain_copy_done
|
|
|
|
// Two 8-byte moves: first 8 bytes and last 8 bytes.
decodeBlockAsm_remain_copy_short_no_ol_memmove_move_8through16:
|
|
MOVQ (BX), R10
|
|
MOVQ -8(BX)(DX*1), BX
|
|
MOVQ R10, (SI)
|
|
MOVQ BX, -8(SI)(DX*1)
|
|
JMP decodeBlockAsm_remain_copy_done
|
|
|
|
// Two unaligned 16-byte moves: first 16 bytes and last 16 bytes.
decodeBlockAsm_remain_copy_short_no_ol_memmove_move_17through32:
|
|
MOVOU (BX), X0
|
|
MOVOU -16(BX)(DX*1), X1
|
|
MOVOU X0, (SI)
|
|
MOVOU X1, -16(SI)(DX*1)
|
|
JMP decodeBlockAsm_remain_copy_done
|
|
|
|
// Four unaligned 16-byte moves: first 32 bytes and last 32 bytes.
decodeBlockAsm_remain_copy_short_no_ol_memmove_move_33through64:
|
|
MOVOU (BX), X0
|
|
MOVOU 16(BX), X1
|
|
MOVOU -32(BX)(DX*1), X2
|
|
MOVOU -16(BX)(DX*1), X3
|
|
MOVOU X0, (SI)
|
|
MOVOU X1, 16(SI)
|
|
MOVOU X2, -32(SI)(DX*1)
|
|
MOVOU X3, -16(SI)(DX*1)
|
|
JMP decodeBlockAsm_remain_copy_done
|
|
|
|
// Validates a copy of DX bytes whose source lies R9 bytes behind SI and picks
// the copy routine: overlapping, long (DX > 64), or short (falls through).
// NOTE(review): DI appears to count bytes already produced and AX appears to
// be the upper bound for SI; both are set outside this section — confirm.
decodeBlockAsm_remain_copy_exec:
|
|
// Reject when R9 > DI (unsigned, 32-bit compare).
CMPL R9, DI
|
|
JA corrupt
|
|
// Reject when SI+DX > AX.
LEAQ (SI)(DX*1), BX
|
|
CMPQ BX, AX
|
|
JA corrupt
|
|
// BX = SI - R9: address the copy reads from.
MOVQ SI, BX
|
|
SUBQ R9, BX
|
|
// R9 < DX means source and destination ranges overlap.
CMPL R9, DX
|
|
JB decodeBlockAsm_remain_copy_overlap
|
|
// More than 64 bytes: use the long routine; otherwise fall through to the
// label that follows this section.
CMPL DX, $0x40
|
|
JA decodeBlockAsm_remain_copy_long
|
|
|
|
// Copies DX bytes from (BX) to (SI) using a size-class dispatch. SI and DX
// are left unchanged; every path ends at decodeBlockAsm_remain_copy_done.
// Reached by fall-through from the preceding section and by a jump from
// decodeBlockAsm_remain_copy_exec_short, both after BX = SI - R9 is computed.
// NOTE(review): the largest class moves at most 64 bytes, so DX is presumably
// <= 64 here — confirm for all entry points.
decodeBlockAsm_remain_copy_short:
|
|
// genMemMoveShort
|
|
// margin: -4, min move: 1
|
|
// Pick a size class by DX: <3, ==3, <=8, <=16, <=32, otherwise 33..64.
CMPQ DX, $0x03
|
|
JB decodeBlockAsm_remain_copy_short_memmove_move_1or2
|
|
JE decodeBlockAsm_remain_copy_short_memmove_move_3
|
|
CMPQ DX, $0x08
|
|
JBE decodeBlockAsm_remain_copy_short_memmove_move_4through8
|
|
CMPQ DX, $0x10
|
|
JBE decodeBlockAsm_remain_copy_short_memmove_move_8through16
|
|
CMPQ DX, $0x20
|
|
JBE decodeBlockAsm_remain_copy_short_memmove_move_17through32
|
|
JMP decodeBlockAsm_remain_copy_short_memmove_move_33through64
|
|
|
|
// First byte and last byte (the same byte when DX == 1).
decodeBlockAsm_remain_copy_short_memmove_move_1or2:
|
|
MOVB (BX), R10
|
|
// BL is the low byte of BX, so this load overwrites the source pointer.
MOVB -1(BX)(DX*1), BL
|
|
MOVB R10, (SI)
|
|
MOVB BL, -1(SI)(DX*1)
|
|
JMP decodeBlockAsm_remain_copy_done
|
|
|
|
// One 16-bit move followed by one byte move.
decodeBlockAsm_remain_copy_short_memmove_move_3:
|
|
MOVW (BX), R10
|
|
MOVB 2(BX), BL
|
|
MOVW R10, (SI)
|
|
MOVB BL, 2(SI)
|
|
JMP decodeBlockAsm_remain_copy_done
|
|
|
|
// The remaining classes load the head and the tail of the range (the two may
// overlap) before storing either.
decodeBlockAsm_remain_copy_short_memmove_move_4through8:
|
|
MOVL (BX), R10
|
|
MOVL -4(BX)(DX*1), BX
|
|
MOVL R10, (SI)
|
|
MOVL BX, -4(SI)(DX*1)
|
|
JMP decodeBlockAsm_remain_copy_done
|
|
|
|
// Two 8-byte moves: first 8 bytes and last 8 bytes.
decodeBlockAsm_remain_copy_short_memmove_move_8through16:
|
|
MOVQ (BX), R10
|
|
MOVQ -8(BX)(DX*1), BX
|
|
MOVQ R10, (SI)
|
|
MOVQ BX, -8(SI)(DX*1)
|
|
JMP decodeBlockAsm_remain_copy_done
|
|
|
|
// Two unaligned 16-byte moves: first 16 bytes and last 16 bytes.
decodeBlockAsm_remain_copy_short_memmove_move_17through32:
|
|
MOVOU (BX), X0
|
|
MOVOU -16(BX)(DX*1), X1
|
|
MOVOU X0, (SI)
|
|
MOVOU X1, -16(SI)(DX*1)
|
|
JMP decodeBlockAsm_remain_copy_done
|
|
|
|
// Four unaligned 16-byte moves: first 32 bytes and last 32 bytes.
decodeBlockAsm_remain_copy_short_memmove_move_33through64:
|
|
MOVOU (BX), X0
|
|
MOVOU 16(BX), X1
|
|
MOVOU -32(BX)(DX*1), X2
|
|
MOVOU -16(BX)(DX*1), X3
|
|
MOVOU X0, (SI)
|
|
MOVOU X1, 16(SI)
|
|
MOVOU X2, -32(SI)(DX*1)
|
|
MOVOU X3, -16(SI)(DX*1)
|
|
JMP decodeBlockAsm_remain_copy_done
|
|
|
|
// Copies DX bytes from (BX) to (SI) for the long case. The first and last 32
// source bytes are captured in X0..X3 before any store; the middle is copied
// in 32-byte steps with aligned stores, and the captured head and tail are
// written last with unaligned stores. SI, BX and DX are left unchanged and
// execution falls through to the label that follows this section.
decodeBlockAsm_remain_copy_long:
|
|
// genMemMoveLong
|
|
// Capture the first 32 and the last 32 bytes of the source.
MOVOU (BX), X0
|
|
MOVOU 16(BX), X1
|
|
MOVOU -32(BX)(DX*1), X2
|
|
MOVOU -16(BX)(DX*1), X3
|
|
// R11 = DX / 32.
MOVQ DX, R11
|
|
SHRQ $0x05, R11
|
|
// R10 = SI mod 32; R12 = 64 - R10, so that SI + R12 - 32 is 32-byte aligned.
MOVQ SI, R10
|
|
ANDL $0x0000001f, R10
|
|
MOVQ $0x00000040, R12
|
|
SUBQ R10, R12
|
|
// NOTE(review): DECQ does not modify CF, so the JA below sees CF from the
// SUBQ above (no borrow, since R10 <= 31) and ZF from DECQ; it is therefore
// taken whenever the decremented R11 is non-zero.
DECQ R11
|
|
JA decodeBlockAsm_remain_copy_longlarge_forward_sse_loop_32
|
|
// R10 / R13 = aligned-destination-relative source and destination cursors.
LEAQ -32(BX)(R12*1), R10
|
|
LEAQ -32(SI)(R12*1), R13
|
|
|
|
// 32 bytes per iteration: unaligned loads, aligned stores.
decodeBlockAsm_remain_copy_longlarge_big_loop_back:
|
|
MOVOU (R10), X4
|
|
MOVOU 16(R10), X5
|
|
MOVOA X4, (R13)
|
|
MOVOA X5, 16(R13)
|
|
ADDQ $0x20, R13
|
|
ADDQ $0x20, R10
|
|
ADDQ $0x20, R12
|
|
// NOTE(review): JNA is taken when CF (left by the ADDQ above; DECQ preserves
// it) or ZF (from DECQ) is set — confirm the intended loop condition against
// the generator.
DECQ R11
|
|
JNA decodeBlockAsm_remain_copy_longlarge_big_loop_back
|
|
|
|
// Copy 32 bytes ending at offset R12 from the start of the range, then step
// R12 by 32; repeat while DX >= R12 so stores stay within the first DX bytes.
decodeBlockAsm_remain_copy_longlarge_forward_sse_loop_32:
|
|
MOVOU -32(BX)(R12*1), X4
|
|
MOVOU -16(BX)(R12*1), X5
|
|
MOVOA X4, -32(SI)(R12*1)
|
|
MOVOA X5, -16(SI)(R12*1)
|
|
ADDQ $0x20, R12
|
|
CMPQ DX, R12
|
|
JAE decodeBlockAsm_remain_copy_longlarge_forward_sse_loop_32
|
|
// Write the captured head and tail to cover the unaligned edges.
MOVOU X0, (SI)
|
|
MOVOU X1, 16(SI)
|
|
MOVOU X2, -32(SI)(DX*1)
|
|
MOVOU X3, -16(SI)(DX*1)
|
|
|
|
// Common exit for the non-overlapping copy routines: advance the write
// pointer SI and DI by the DX bytes just copied, then resume at
// decodeBlockAsm_remain_loop.
decodeBlockAsm_remain_copy_done:
|
|
ADDQ DX, SI
|
|
ADDQ DX, DI
|
|
JMP decodeBlockAsm_remain_loop
|
|
|
|
// Overlapping copy, reached when R9 < DX with BX = SI - R9. Bytes are copied
// one at a time in ascending order, so bytes stored earlier in this copy are
// read back as source later on.
decodeBlockAsm_remain_copy_overlap:
|
|
// Account for the whole length in DI up front; DX is consumed as the loop
// counter below.
ADDQ DX, DI
|
|
|
|
// Copy one byte from (BX) to (SI), advance both, repeat DX times.
decodeBlockAsm_remain_copy_overlap_simple:
|
|
MOVB (BX), R10
|
|
MOVB R10, (SI)
|
|
INCQ BX
|
|
INCQ SI
|
|
DECQ DX
|
|
JNZ decodeBlockAsm_remain_copy_overlap_simple
|
|
JMP decodeBlockAsm_remain_loop
|
|
|
|
// Final consistency check: succeed (ret = 0) only if the write pointer SI is
// exactly at dst_base+dst_len and the read pointer R8 is exactly at
// src_base+src_len; otherwise go to corrupt.
decodeBlockAsm_remain_end_copy:
|
|
decodeBlockAsm_remain_end_done:
|
|
MOVQ src_base+24(FP), AX
|
|
MOVQ src_len+32(FP), CX
|
|
MOVQ dst_base+0(FP), DX
|
|
MOVQ dst_len+8(FP), BX
|
|
// DX = one past the end of dst.
LEAQ (DX)(BX*1), DX
|
|
// AX = one past the end of src.
LEAQ (AX)(CX*1), AX
|
|
CMPQ SI, DX
|
|
JNE corrupt
|
|
CMPQ R8, AX
|
|
JNE corrupt
|
|
MOVQ $0x00000000, ret+48(FP)
|
|
RET
|
|
|
|
// Shared failure exit: every bounds or consistency check in this section
// jumps here. Stores 1 in the return slot and returns.
corrupt:
|
|
MOVQ $0x00000001, ret+48(FP)
|
|
RET
|