Full Mattermost server source with integrated Community Enterprise features. Includes a vendor directory for offline/air-gapped builds.

Structure:
- enterprise-impl/: Enterprise feature implementations
- enterprise-community/: Init files that register implementations
- enterprise/: Bridge imports (community_imports.go)
- vendor/: All dependencies for offline builds

Build (online): go build ./cmd/mattermost
Build (offline/air-gapped): go build -mod=vendor ./cmd/mattermost

🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
181 lines
4.6 KiB
Go
181 lines
4.6 KiB
Go
package goose
|
|
|
|
import (
|
|
"bytes"
|
|
"regexp"
|
|
"strconv"
|
|
"strings"
|
|
|
|
"github.com/PuerkitoBio/goquery"
|
|
"golang.org/x/net/html"
|
|
)
|
|
|
|
// Regular expressions shared by the output formatter. They are compiled once
// at package initialisation.
var (
	// normalizeWhitespaceRegexp matches a run of horizontal whitespace
	// (space, CR, form feed, vertical tab, tab); newlines are not included.
	normalizeWhitespaceRegexp = regexp.MustCompile(`[ \r\f\v\t]+`)

	// normalizeNl matches a run of one or more newlines.
	normalizeNl = regexp.MustCompile(`[\n]+`)

	// validURLRegex matches strings that start with "http://" or "https://"
	// (lower case only).
	validURLRegex = regexp.MustCompile("^http[s]?://")
)
|
|
|
|
type outputFormatter struct {
|
|
topNode *goquery.Selection
|
|
config Configuration
|
|
language string
|
|
}
|
|
|
|
func (formatter *outputFormatter) getLanguage(lang string) string {
|
|
if formatter.config.useMetaLanguage && "" != lang {
|
|
return lang
|
|
}
|
|
return formatter.config.targetLanguage
|
|
}
|
|
|
|
func (formatter *outputFormatter) getTopNode() *goquery.Selection {
|
|
return formatter.topNode
|
|
}
|
|
|
|
func (formatter *outputFormatter) getFormattedText(topNode *goquery.Selection, lang string) (output string, links []string) {
|
|
formatter.topNode = topNode
|
|
formatter.language = formatter.getLanguage(lang)
|
|
if formatter.language == "" {
|
|
formatter.language = formatter.config.targetLanguage
|
|
}
|
|
formatter.removeNegativescoresNodes()
|
|
links = formatter.linksToText()
|
|
formatter.replaceTagsWithText()
|
|
formatter.removeParagraphsWithFewWords()
|
|
|
|
output = formatter.getOutputText()
|
|
return output, links
|
|
}
|
|
|
|
func (formatter *outputFormatter) convertToText() string {
|
|
var txts []string
|
|
selections := formatter.topNode
|
|
selections.Each(func(i int, s *goquery.Selection) {
|
|
txt := s.Text()
|
|
if txt != "" {
|
|
// txt = txt //unescape
|
|
txtLis := strings.Trim(txt, "\n")
|
|
txts = append(txts, txtLis)
|
|
}
|
|
})
|
|
return strings.Join(txts, "\n\n")
|
|
}
|
|
|
|
// check if this is a valid URL
|
|
func isValidURL(u string) bool {
|
|
return validURLRegex.MatchString(u)
|
|
}
|
|
|
|
func (formatter *outputFormatter) linksToText() []string {
|
|
var urlList []string
|
|
links := formatter.topNode.Find("a")
|
|
links.Each(func(i int, a *goquery.Selection) {
|
|
imgs := a.Find("img")
|
|
// ignore linked images
|
|
if imgs.Length() == 0 {
|
|
// save a list of URLs
|
|
url, _ := a.Attr("href")
|
|
if isValidURL(url) {
|
|
urlList = append(urlList, url)
|
|
}
|
|
// replace <a> tag with its text contents
|
|
replaceTagWithContents(a, whitelistedExtAtomTypes)
|
|
|
|
// see whether we can collapse the parent node now
|
|
replaceTagWithContents(a.Parent(), whitelistedTextAtomTypes)
|
|
}
|
|
})
|
|
|
|
return urlList
|
|
}
|
|
|
|
// Text gets the combined text contents of each element in the set of matched
|
|
// elements, including their descendants.
|
|
//
|
|
// @see https://github.com/PuerkitoBio/goquery/blob/master/property.go
|
|
func (formatter *outputFormatter) Text(s *goquery.Selection) string {
|
|
var buf bytes.Buffer
|
|
|
|
// Slightly optimized vs calling Each: no single selection object created
|
|
var f func(*html.Node)
|
|
f = func(n *html.Node) {
|
|
if n.Type == html.TextNode && 0 == n.DataAtom { // NB: had to add the DataAtom check to avoid printing text twice when a textual node embeds another textual node
|
|
// Keep newlines and spaces, like jQuery
|
|
buf.WriteString(n.Data)
|
|
}
|
|
if n.FirstChild != nil {
|
|
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
|
f(c)
|
|
}
|
|
}
|
|
}
|
|
for _, n := range s.Nodes {
|
|
f(n)
|
|
}
|
|
|
|
return buf.String()
|
|
}
|
|
|
|
func (formatter *outputFormatter) getOutputText() string {
|
|
//out := formatter.topNode.Text()
|
|
out := formatter.Text(formatter.topNode)
|
|
out = normalizeWhitespaceRegexp.ReplaceAllString(out, " ")
|
|
|
|
strArr := strings.Split(out, "\n")
|
|
resArr := []string{}
|
|
|
|
for i, v := range strArr {
|
|
v = strings.TrimSpace(v)
|
|
if v != "" {
|
|
resArr = append(resArr, v)
|
|
} else if i > 2 && strArr[i-2] != "" {
|
|
resArr = append(resArr, "")
|
|
}
|
|
}
|
|
|
|
out = strings.Join(resArr, "\n")
|
|
out = normalizeNl.ReplaceAllString(out, "\n\n")
|
|
|
|
out = strings.TrimSpace(out)
|
|
return out
|
|
}
|
|
|
|
func (formatter *outputFormatter) removeNegativescoresNodes() {
|
|
gravityItems := formatter.topNode.Find("*[gravityScore]")
|
|
gravityItems.Each(func(i int, s *goquery.Selection) {
|
|
var score int
|
|
sscore, exists := s.Attr("gravityScore")
|
|
if exists {
|
|
score, _ = strconv.Atoi(sscore)
|
|
if score < 1 {
|
|
sNode := s.Get(0)
|
|
sNode.Parent.RemoveChild(sNode)
|
|
}
|
|
}
|
|
|
|
})
|
|
}
|
|
|
|
func (formatter *outputFormatter) replaceTagsWithText() {
|
|
for _, tag := range []string{"em", "strong", "b", "i", "span", "h1", "h2", "h3", "h4"} {
|
|
nodes := formatter.topNode.Find(tag)
|
|
nodes.Each(func(i int, node *goquery.Selection) {
|
|
replaceTagWithContents(node, whitelistedTextAtomTypes)
|
|
})
|
|
}
|
|
}
|
|
|
|
func (formatter *outputFormatter) removeParagraphsWithFewWords() {
|
|
language := formatter.language
|
|
if language == "" {
|
|
language = "en"
|
|
}
|
|
allNodes := formatter.topNode.Children()
|
|
allNodes.Each(func(i int, s *goquery.Selection) {
|
|
sw := formatter.config.stopWords.stopWordsCount(language, s.Text())
|
|
if sw.wordCount < 5 && s.Find("object").Length() == 0 && s.Find("em").Length() == 0 {
|
|
node := s.Get(0)
|
|
node.Parent.RemoveChild(node)
|
|
}
|
|
})
|
|
}
|