// Copyright 2025 MinIO Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package minlz

import (
	"fmt"
	"math"
	"math/bits"
	"sync"
)

// pools with hash tables for best encoding.
var encBestLPool sync.Pool
var encBestSPool sync.Pool

// encodeBlockBest encodes a non-empty src to a guaranteed-large-enough dst. It
// assumes that the varint-encoded length of the decompressed bytes has already
// been written.
//
// It also assumes that:
//
//	len(dst) >= MaxEncodedLen(len(src)) &&
//	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
func encodeBlockBest(dst, src []byte, dict *dict) (d int) {
	// Initialize the hash tables.
	// TODO: dict
	const (
		// Long hash matches.
		lTableBits    = 20
		maxLTableSize = 1 << lTableBits

		// Short hash matches.
		sTableBits    = 18
		maxSTableSize = 1 << sTableBits

		inputMargin = 8 + 2

		debug = debugEncode
	)

	// sLimit is when to stop looking for offset/length copies. The inputMargin
	// lets us use a fast path for emitLiteral in the main loop, while we are
	// looking for copies.
	sLimit := len(src) - inputMargin
	if len(src) < minNonLiteralBlockSize {
		return 0
	}
	sLimitDict := len(src) - inputMargin
	if sLimitDict > maxDictSrcOffset-inputMargin {
		sLimitDict = maxDictSrcOffset - inputMargin
	}

	var lTable *[maxLTableSize]uint64
	if t := encBestLPool.Get(); t != nil {
		lTable = t.(*[maxLTableSize]uint64)
		*lTable = [maxLTableSize]uint64{}
	} else {
		lTable = new([maxLTableSize]uint64)
	}
	defer encBestLPool.Put(lTable)

	var sTable *[maxSTableSize]uint64
	if t := encBestSPool.Get(); t != nil {
		sTable = t.(*[maxSTableSize]uint64)
		*sTable = [maxSTableSize]uint64{}
	} else {
		sTable = new([maxSTableSize]uint64)
	}
	defer encBestSPool.Put(sTable)

	//var lTable [maxLTableSize]uint64
	//var sTable [maxSTableSize]uint64

	// Bail if we can't compress to at least this.
	dstLimit := len(src) - 5

	// nextEmit is where in src the next emitLiteral should start from.
	nextEmit := 0

	// The encoded form must start with a literal, as there are no previous
	// bytes to copy, so we start looking for hash matches at s == 1.
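	// When a dictionary is supplied, matching can instead start at s == 0,
	// with the initial repeat offset seeded from the dictionary below.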
	s := 1
	repeat := 1
	if dict != nil {
		//dict.initBest()
		s = 0
		repeat = len(dict.dict) - dict.repeat
	}
	cv := load64(src, s)

	// We search for a repeat at -1, but don't output repeats when nextEmit == 0.
	const lowbitMask = 0xffffffff
	getCur := func(x uint64) int {
		return int(x & lowbitMask)
	}
	getPrev := func(x uint64) int {
		return int(x >> 32)
	}
	const maxSkip = 64

	if debugEncode {
		fmt.Println("encodeBlockBest: Starting encode")
	}
	for {
		type match struct {
			offset    int
			s         int
			length    int
			score     int
			rep, dict bool
			nextrep   bool
		}
		var best match
		for {
			// Next src position to check.
			nextS := (s-nextEmit)>>8 + 1
			if nextS > maxSkip {
				nextS = s + maxSkip
			} else {
				nextS += s
			}
			if nextS > sLimit {
				goto emitRemainder
			}
			if dict != nil && s >= maxDictSrcOffset {
				dict = nil
				if repeat > s {
					repeat = math.MinInt32
				}
			}
			hashL := hash8(cv, lTableBits)
			hashS := hash4(cv, sTableBits)
			candidateL := lTable[hashL]
			candidateS := sTable[hashS]

			score := func(m match) int {
				// Matches that start further forward are penalized,
				// since the bytes before them must be emitted as literals.
				ll := m.s - nextEmit

				// Bigger score is better.
				// -m.s indicates the base cost.
				score := m.length - emitLiteralSizeN(ll) - m.s
				offset := m.s - m.offset
				if m.rep {
					return score - emitRepeatSize(m.length)
				}
				if ll > 0 && offset > 1024 {
					// Check for fused discount.
					if ll <= maxCopy2Lits && offset < 65536+63 && m.length <= copy2LitMaxLen {
						// 1-4 literals can be embedded in copy2 without cost.
						score++
					} else if ll <= maxCopy3Lits {
						// 0-3 literals can be embedded in copy3 without cost.
						score++
					}
				}
				return score - emitCopySize(offset, m.length)
			}

			matchAt := func(offset, s int, first uint32) match {
				if (best.length != 0 && best.s-best.offset == s-offset) || s-offset >= maxCopy3Offset || s <= offset {
					// Don't retest if we have the same offset.
					return match{offset: offset, s: s}
				}
				if debug && s == offset {
					panic(offset)
				}
				if load32(src, offset) != first {
					return match{offset: offset, s: s}
				}
				// While extending, m.length holds the absolute end position in
				// the match source; the offset is subtracted once done.
				m := match{offset: offset, s: s, length: 4 + offset, rep: false}
				s += 4
				for s < len(src) {
					if len(src)-s < 8 {
						if src[s] == src[m.length] {
							m.length++
							s++
							continue
						}
						break
					}
					if diff := load64(src, s) ^ load64(src, m.length); diff != 0 {
						m.length += bits.TrailingZeros64(diff) >> 3
						break
					}
					s += 8
					m.length += 8
				}
				// Extend back...
				for m.s > nextEmit && m.offset > 0 {
					if src[m.offset-1] != src[m.s-1] {
						break
					}
					m.s--
					m.offset--
					m.length++
				}
				m.length -= offset
				m.score = score(m)
				if m.score <= -m.s {
					// Eliminate if no savings, we might find a better one.
					m.length = 0
				}
				if m.s+m.length < sLimit {
					const checkoff = 1
					a, b := m.s+m.length+checkoff, m.offset+m.length+checkoff
					m.nextrep = load32(src, a) == load32(src, b)
				}
				return m
			}
			matchAtRepeat := func(offset, s int, first uint32) match {
				if best.rep {
					// Don't retest if we already have a repeat.
					return match{offset: offset, s: s}
				}
				// Checking 2 bytes gives close to no improvement, since it may
				// just give a 'literal -> len 2 repeat -> literal' section,
				// which eats up the gains in overhead.
				// 3 gives pretty consistent improvement.
				const checkbytes = 3
				mask := uint32((1 << (8 * checkbytes)) - 1)
				if load32(src, offset)&mask != first&mask {
					return match{offset: offset, s: s}
				}
				m := match{offset: offset, s: s, length: checkbytes + offset, rep: true}
				s += checkbytes
				for s < len(src) {
					if len(src)-s < 8 {
						if src[s] == src[m.length] {
							m.length++
							s++
							continue
						}
						break
					}
					if diff := load64(src, s) ^ load64(src, m.length); diff != 0 {
						m.length += bits.TrailingZeros64(diff) >> 3
						break
					}
					s += 8
					m.length += 8
				}
				// Extend back...
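				// Step the match start backwards while the preceding bytes
				// also match, lengthening the match without adding literal cost.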
				for m.s > nextEmit && m.offset > 0 {
					if src[m.offset-1] != src[m.s-1] {
						break
					}
					m.s--
					m.offset--
					m.length++
				}
				m.length -= offset
				if m.s+m.length < sLimit {
					const checkoff = 1
					a, b := m.s+m.length+checkoff, m.offset+m.length+checkoff
					m.nextrep = load32(src, a) == load32(src, b)
				}
				m.score = score(m)
				if debug && m.length > 0 && m.length < 3 {
					fmt.Println("repeat", m.length, "offset", m.offset, "s", m.s, "score", m.score, "first", first, "mask", mask, "src", src[m.offset:m.offset+m.length], "src", src[m.s:m.s+m.length])
				}
				return m
			}
			matchDict := func(candidate, s int, first uint32, rep bool) match {
				if s >= maxDictSrcOffset {
					return match{offset: candidate, s: s}
				}
				// Calculate the offset as if dict and src were one continuous array.
				offset := -len(dict.dict) + candidate
				if best.length != 0 && best.s-best.offset == s-offset && !rep {
					// Don't retest if we have the same offset.
					return match{offset: offset, s: s}
				}
				if load32(dict.dict, candidate) != first {
					return match{offset: offset, s: s}
				}
				m := match{offset: offset, s: s, length: 4 + candidate, rep: rep, dict: true}
				s += 4
				if !rep {
					for s < sLimitDict && m.length < len(dict.dict) {
						if len(src)-s < 8 || len(dict.dict)-m.length < 8 {
							if src[s] == dict.dict[m.length] {
								m.length++
								s++
								continue
							}
							break
						}
						if diff := load64(src, s) ^ load64(dict.dict, m.length); diff != 0 {
							m.length += bits.TrailingZeros64(diff) >> 3
							break
						}
						s += 8
						m.length += 8
					}
				} else {
					for s < len(src) && m.length < len(dict.dict) {
						if len(src)-s < 8 || len(dict.dict)-m.length < 8 {
							if src[s] == dict.dict[m.length] {
								m.length++
								s++
								continue
							}
							break
						}
						if diff := load64(src, s) ^ load64(dict.dict, m.length); diff != 0 {
							m.length += bits.TrailingZeros64(diff) >> 3
							break
						}
						s += 8
						m.length += 8
					}
				}
				m.length -= candidate
				m.score = score(m)
				if m.score <= -m.s {
					// Eliminate if no savings, we might find a better one.
					m.length = 0
				}
				return m
			}
			bestOf := func(a, b match) match {
				if b.length == 0 {
					return a
				}
				if a.length == 0 {
					return b
				}
				if a.score > b.score {
					return a
				}
				if b.score > a.score {
					return b
				}
				// Pick whichever starts the earliest;
				// we can probably find a match right away.
				if a.s != b.s {
					if a.s < b.s {
						return a
					}
					return b
				}
				// If one is a good repeat candidate, pick it.
				if a.nextrep != b.nextrep {
					if a.nextrep {
						return a
					}
					return b
				}
				// Pick the smallest distance offset.
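				// (offset is the match source position, so the larger value
				// represents the shorter back-reference distance.)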
				if a.offset > b.offset {
					return a
				}
				return b
			}

			if s > 0 {
				best = bestOf(matchAt(getCur(candidateL), s, uint32(cv)), matchAt(getPrev(candidateL), s, uint32(cv)))
				best = bestOf(best, matchAt(getCur(candidateS), s, uint32(cv)))
				best = bestOf(best, matchAt(getPrev(candidateS), s, uint32(cv)))
			}
			if dict != nil {
				candidateL := dict.bestTableLong[hashL]
				candidateS := dict.bestTableShort[hashS]
				best = bestOf(best, matchDict(int(candidateL&0xffff), s, uint32(cv), false))
				best = bestOf(best, matchDict(int(candidateL>>16), s, uint32(cv), false))
				best = bestOf(best, matchDict(int(candidateS&0xffff), s, uint32(cv), false))
				best = bestOf(best, matchDict(int(candidateS>>16), s, uint32(cv), false))
			}
			{
				if dict == nil || repeat <= s {
					best = bestOf(best, matchAtRepeat(s-repeat, s, uint32(cv)))
					best = bestOf(best, matchAtRepeat(s-repeat+1, s+1, uint32(cv>>8)))
				} else if s-repeat < -4 && dict != nil {
					candidate := len(dict.dict) - (repeat - s)
					best = bestOf(best, matchDict(candidate, s, uint32(cv), true))
					candidate++
					best = bestOf(best, matchDict(candidate, s+1, uint32(cv>>8), true))
				}

				if best.length > 0 {
					hashS := hash4(cv>>8, sTableBits)
					// s+1
					nextShort := sTable[hashS]
					sFwd := s + 1
					cv := load64(src, sFwd)
					hashL := hash8(cv, lTableBits)
					nextLong := lTable[hashL]
					best = bestOf(best, matchAt(getCur(nextShort), sFwd, uint32(cv)))
					best = bestOf(best, matchAt(getPrev(nextShort), sFwd, uint32(cv)))
					best = bestOf(best, matchAt(getCur(nextLong), sFwd, uint32(cv)))
					best = bestOf(best, matchAt(getPrev(nextLong), sFwd, uint32(cv)))

					// dict at +1
					if dict != nil {
						candidateL := dict.bestTableLong[hashL]
						candidateS := dict.bestTableShort[hashS]
						best = bestOf(best, matchDict(int(candidateL&0xffff), sFwd, uint32(cv), false))
						best = bestOf(best, matchDict(int(candidateS&0xffff), sFwd, uint32(cv), false))
					}

					// s+2
					if true {
						sFwd++
						cv = load64(src, sFwd)
						hashL := hash8(cv, lTableBits)
						nextLong = lTable[hashL]

						if dict == nil || repeat <= sFwd {
							// Repeat at +2
							best = bestOf(best, matchAtRepeat(sFwd-repeat, sFwd, uint32(cv)))
						} else if repeat-sFwd > 4 && dict != nil {
							candidate := len(dict.dict) - (repeat - sFwd)
							best = bestOf(best, matchDict(candidate, sFwd, uint32(cv), true))
						}
						if true {
							hashS := hash4(cv, sTableBits)
							nextShort = sTable[hashS]
							best = bestOf(best, matchAt(getCur(nextShort), sFwd, uint32(cv)))
							best = bestOf(best, matchAt(getPrev(nextShort), sFwd, uint32(cv)))
						}
						best = bestOf(best, matchAt(getCur(nextLong), sFwd, uint32(cv)))
						best = bestOf(best, matchAt(getPrev(nextLong), sFwd, uint32(cv)))

						// dict at +2
						// Very small gain
						if dict != nil {
							candidateL := dict.bestTableLong[hashL]
							candidateS := dict.bestTableShort[hashS]
							best = bestOf(best, matchDict(int(candidateL&0xffff), sFwd, uint32(cv), false))
							best = bestOf(best, matchDict(int(candidateS&0xffff), sFwd, uint32(cv), false))
						}
					}
					// Search for a match at the best match end; see if that is better.
					// Allow some bytes at the beginning to mismatch.
					// Sweet spot is around 1-2 bytes, but depends on input.
					// The skipped bytes are tested in the backwards extension,
					// and are still picked up as part of the match if they match.
					const skipBeginning = 2
					const skipEnd = 1
					if sAt := best.s + best.length - skipEnd; sAt < sLimit {
						sBack := best.s + skipBeginning - skipEnd
						backL := best.length - skipBeginning
						// Load initial values.
						cv = load64(src, sBack)

						// Grab candidates...
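						// A table hit near the current match end can reveal a
						// longer match that starts at sBack instead.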
						next := lTable[hash8(load64(src, sAt), lTableBits)]

						if checkAt := getCur(next) - backL; checkAt > 0 {
							best = bestOf(best, matchAt(checkAt, sBack, uint32(cv)))
						}
						if checkAt := getPrev(next) - backL; checkAt > 0 {
							best = bestOf(best, matchAt(checkAt, sBack, uint32(cv)))
						}
						// Quite small gain, but generally a benefit on very compressible material.
						if true {
							next = sTable[hash4(load64(src, sAt), sTableBits)]
							if checkAt := getCur(next) - backL; checkAt > 0 {
								best = bestOf(best, matchAt(checkAt, sBack, uint32(cv)))
							}
							if checkAt := getPrev(next) - backL; checkAt > 0 {
								best = bestOf(best, matchAt(checkAt, sBack, uint32(cv)))
							}
						}
					}
				}
			}

			// Update tables.
			lTable[hashL] = uint64(s) | candidateL<<32
			sTable[hashS] = uint64(s) | candidateS<<32

			if best.length > 0 {
				break
			}

			cv = load64(src, nextS)
			s = nextS
		}

		startIdx := s + 1
		s = best.s
		if debug && best.offset >= s {
			panic(fmt.Errorf("t %d >= s %d", best.offset, s))
		}

		// Bail if we exceed the maximum size.
		if d+(s-nextEmit) > dstLimit {
			return 0
		}

		base := s
		offset := s - best.offset
		s += best.length

		// Bail if encoding the match is equal to or worse than emitting literals.
		if !best.rep && best.length <= 4 {
			if offset > 65535 ||
				// Output will almost always be the same, and decoding will be slightly slower.
				// We might find a better match before the end of these 4 bytes.
				(offset > maxCopy1Offset && offset <= maxCopy2Offset && base-nextEmit > maxCopy2Lits) {
				s = startIdx + 1
				if s >= sLimit {
					goto emitRemainder
				}
				cv = load64(src, s)
				continue
			}
		}
		if debug && nextEmit != base {
			fmt.Println("EMIT", base-nextEmit, "literals. base-after:", base)
		}
		if best.rep {
			if debug {
				fmt.Println("REPEAT, length", best.length, "offset:", offset, "s-after:", s, "dict:", best.dict, "best:", best)
			}
			d += emitLiteral(dst[d:], src[nextEmit:base])

			// Same as `d += emitCopy(dst[d:], repeat, s-base)`, but skips storing the offset.
			d += emitRepeat(dst[d:], best.length)
		} else {
			lits := src[nextEmit:base]
			if debug {
				fmt.Println("COPY, length", best.length, "offset:", offset, "s-after:", s, "dict:", best.dict, "best:", best, "lits:", len(lits))
			}
			if len(lits) > 0 {
				if offset <= maxCopy2Offset {
					// 1-2 byte offsets
					if len(lits) > maxCopy2Lits || offset < 64 || (offset <= 1024 && best.length > copy2LitMaxLen) {
						d += emitLiteral(dst[d:], lits)
						if best.length > 18 && best.length <= 64 && offset >= 64 {
							// Size is equal.
							// Prefer Copy2, since it decodes faster.
							d += encodeCopy2(dst[d:], offset, best.length)
						} else {
							d += emitCopy(dst[d:], offset, best.length)
						}
					} else {
						if best.length > 11 {
							// We are emitting the remainder as a separate repeat.
							// We might as well do a search for a better match.
							d += emitCopyLits2(dst[d:], lits, offset, 11)
							s = best.s + 11
						} else {
							d += emitCopyLits2(dst[d:], lits, offset, best.length)
						}
					}
				} else {
					// 3 byte offset
					if len(lits) > maxCopy3Lits {
						d += emitLiteral(dst[d:], lits)
						d += emitCopy(dst[d:], offset, best.length)
					} else {
						d += emitCopyLits3(dst[d:], lits, offset, best.length)
					}
				}
			} else {
				if best.length > 18 && best.length <= 64 && offset >= 64 && offset <= maxCopy2Offset {
					// Size is equal.
					// Prefer Copy2, since it decodes faster.
					d += encodeCopy2(dst[d:], offset, best.length)
				} else {
					d += emitCopy(dst[d:], offset, best.length)
				}
			}
		}
		repeat = offset

		nextEmit = s
		if s >= sLimit {
			goto emitRemainder
		}

		if d > dstLimit {
			// Bail if we don't have space for more.
			return 0
		}
		// Fill tables...
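		// Each table slot holds the two most recent positions for a hash:
		// the newest in the low 32 bits, with the previous occupant shifted
		// into the high 32 bits.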
		for i := startIdx; i < s; i++ {
			cv0 := load64(src, i)
			long0 := hash8(cv0, lTableBits)
			short0 := hash4(cv0, sTableBits)
			lTable[long0] = uint64(i) | lTable[long0]<<32
			sTable[short0] = uint64(i) | sTable[short0]<<32
		}
		cv = load64(src, s)
	}

emitRemainder:
	if nextEmit < len(src) {
		// Bail if we exceed the maximum size.
		litLen := len(src) - nextEmit
		if d+litLen+emitLiteralSizeN(litLen) > dstLimit {
			if debug && nextEmit != s {
				fmt.Println("emitting would exceed dstLimit. Not compressing")
			}
			return 0
		}
		if debug && nextEmit != s {
			fmt.Println("emitted ", len(src)-nextEmit, "literals")
		}
		d += emitLiteral(dst[d:], src[nextEmit:])
	}
	return d
}

// emitCopySize returns the number of bytes required to encode the
// offset+length pair.
//
// It assumes that:
//
//	1 <= offset && offset <= math.MaxUint32
//	4 <= length && length <= 1 << 24
func emitCopySize(offset, length int) int {
	if offset > 65536+63 {
		// 3 byte offset + variable length (base length 4).
		length -= 64 // Base is free. We can add 64 for free.
		if length <= 0 {
			return 4
		}
		return 4 + (bits.Len(uint(length))+7)/8
	}

	// Offset no more than 2 bytes.
	if offset <= 1024 {
		if length <= 18 {
			// Emit up to 18 bytes with short offset.
			return 2
		}
		if length < 18+256 {
			return 3
		}
		// Worst case: we have to emit a repeat for the rest.
		return 2 + emitRepeatSize(length-18)
	}
	// 2 byte offset + variable length (base length 4).
	return emitCopy2Size(length)
}

// emitRepeatSize returns the number of bytes required to encode a repeat.
// Length must be at least 1 and < 1<<24.
func emitRepeatSize(length int) int {
	if length <= 0 {
		return 0
	}
	if length <= 29 {
		return 1
	}
	length -= 29
	if length <= 256 {
		return 2
	}
	if length <= 65536 {
		return 3
	}
	return 4
}

// emitCopy2Size returns the number of bytes required to encode a copy2.
// Length must be less than 1<<24.
func emitCopy2Size(length int) int {
	length -= 4
	if length <= 60 {
		// Length inside tag.
		return 3
	}
	length -= 60
	if length < 256 {
		// Length in 1 byte.
		return 4
	}
	if length < 65536 {
		// Length in 2 bytes.
		return 5
	}
	// Length in 3 bytes.
	return 6
}
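// exampleEmitSizes is an illustrative sketch, not part of the encoder: it
// exercises the size helpers above at their branch boundaries, so every
// printed value follows directly from emitRepeatSize and emitCopy2Size.
func exampleEmitSizes() {
	fmt.Println(emitRepeatSize(29))  // 1: length fits in the tag byte
	fmt.Println(emitRepeatSize(30))  // 2: one extra length byte
	fmt.Println(emitRepeatSize(285)) // 2: 29+256 still fits in one length byte
	fmt.Println(emitRepeatSize(286)) // 3: two length bytes
	fmt.Println(emitCopy2Size(64))   // 3: length-4 <= 60 fits in the tag
	fmt.Println(emitCopy2Size(65))   // 4: one extra length byte
}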