mattermost-community-enterp.../vendor/github.com/advancedlogic/GoOse/cleaner.go
package goose

import (
"container/list"
"log"
"regexp"
"strings"

"github.com/PuerkitoBio/goquery"
"golang.org/x/net/html"
"golang.org/x/net/html/atom"
)
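// Atom types whose tags may be collapsed into their text content; the "Ext" list additionally allows anchors.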
var whitelistedTextAtomTypes = []atom.Atom{atom.Span, atom.Em, atom.I, atom.Strong, atom.B, atom.P, atom.H1, atom.H2, atom.H3, atom.H4}
var whitelistedExtAtomTypes = []atom.Atom{atom.A, atom.Span, atom.Em, atom.I, atom.Strong, atom.B, atom.P, atom.H1, atom.H2, atom.H3, atom.H4}
// Cleaner removes menus, ads, sidebars, etc. and leaves the main content
type Cleaner struct {
config Configuration
}
// NewCleaner returns a new instance of a Cleaner
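//
// Illustrative use (sketch; the Configuration value is normally supplied by
// the surrounding GoOse pipeline):
//
//	cleaner := NewCleaner(config)
//	doc = cleaner.Clean(doc)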
func NewCleaner(config Configuration) Cleaner {
return Cleaner{
config: config,
}
}
// replaceTagWithContents removes the tag, replacing it with its text contents
// e.g. "<em>some text</em>" becomes "some text"
func replaceTagWithContents(tagSelection *goquery.Selection, collapsibleAtomTypes []atom.Atom) {
if tagSelection.Length() == 0 {
return
}
node := tagSelection.Get(0)
node.Data = tagSelection.Text()
node.Type = html.TextNode
if node.FirstChild == nil {
node.Attr = []html.Attribute{}
node.DataAtom = 0
node.FirstChild = nil
node.LastChild = nil
} else {
// If all children are text only, the parent already contains the text, so drop them
collapseTextNodes(node, collapsibleAtomTypes)
}
}
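// isAtomTypeWhitelisted reports whether t appears in the given whitelist.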
func isAtomTypeWhitelisted(t atom.Atom, whitelist []atom.Atom) bool {
for _, allowed := range whitelist {
if t == allowed {
return true
}
}
return false
}
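// collapseTextNodes recursively drops the children of whitelisted nodes that contain only text,
// since the enclosing text node set up by replaceTagWithContents already carries the combined text.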
func collapseTextNodes(node *html.Node, collapsibleAtomTypes []atom.Atom) {
if node.FirstChild == nil {
return
}
if !isAtomTypeWhitelisted(node.DataAtom, collapsibleAtomTypes) {
return
}
if node.FirstChild.DataAtom == 0 && node.FirstChild == node.LastChild {
// this tag only contains a single textual node, already contained in the parent
node.Attr = []html.Attribute{}
node.Type = html.TextNode
node.DataAtom = 0
node.FirstChild = nil
node.LastChild = nil
return
}
// If all children are text only, the parent already contains the text, so drop them
allTextNodes := true
for c := node.FirstChild; c != nil; c = c.NextSibling {
// attempt collapsing recursively
collapseTextNodes(c, collapsibleAtomTypes)
if c.DataAtom != 0 {
// not collapsed
allTextNodes = false
break
}
}
if allTextNodes {
// text already contained in the parent node => drop children
node.Attr = []html.Attribute{}
node.Type = html.TextNode
node.DataAtom = 0
node.FirstChild = nil
node.LastChild = nil
}
}
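// divToPElementsPattern matches opening tags that mark a container as holding structured content rather than loose text.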
var divToPElementsPattern = regexp.MustCompile("<(a|blockquote|dl|div|img|ol|p|pre|table|ul)")
// tabsRegEx matches tab characters and whitespace-only text
var tabsRegEx = regexp.MustCompile(`\t|^\s+$`)
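// removeVisibilityStyleRegEx matches inline styles that hide an element.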
var removeVisibilityStyleRegEx = regexp.MustCompile("visibility:[ ]*hidden|display:[ ]*none")
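// keepNodesRegEx protects attribute values from removal even when removeNodesRegEx matches them.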
var keepNodesRegEx = regexp.MustCompile(`\b(` +
`article|` + // theguardian.com and newyorker.com (preventing match of "commercial" or "...-ad-...")
`field--label-hidden` + // eff.org (preventing match of "hidden")
`)\b`)
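// removeNodesRegEx matches id, class and name values typical of boilerplate:
// ads, navigation, comments, social widgets, footers and similar page chrome.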
var removeNodesRegEx = regexp.MustCompile("" +
"[Cc]omentario|" +
"[Ff]ooter|" +
"^fn$|" +
"^inset$|" +
"^print$|" +
"^scroll$|" +
"^side$|" +
"^side_|" +
"^widget$|" +
"^ab[0-9]$|" +
"[_-]ads$|" +
"^ad[s]?[ _-]|" +
"[_-]ad[s]?[_-]|" +
"^ADX_CLIENTSIDE$|" +
"ajoutVideo|" +
"^alerts|" +
"^Anchor$|" +
"articleheadings|" +
"_articles|" +
"^article-gallery-embedded$|" +
"author|" +
"author-dropdown|" +
"^banner|" +
"^bar$|" +
"blog-pager|" +
"brass\\-rail|" +
"breadcrumbs|" +
"button|" +
"byline|" +
"cabecalho|" +
"^caption$|" +
"carousel|" +
"^click|" +
"cnnStryHghLght|" +
"cnn_html_slideshow|" +
"cnn_strycaptiontxt|" +
"cnn_strylftcntnt|" +
"cnn_stryspcvbx|" +
"combx|" +
"comment|" +
"commercial|" +
"communitypromo|" +
"^comscore$|" +
"contact|" +
"contentTools2|" +
"controls|" +
"cookie|" +
"CoversMainContent|" +
"^css-|" +
"^critical-alerts$|" +
"^date$|" +
"detail_new_|" +
"downloadLink|" +
"^DYSRC$|" +
"^early-body|" +
"ec_blogs|" +
"^[^entry-]more.*$|" +
"error|" +
"[^-]facebook|" +
"facebook-broadcasting|" +
"^fb-root$|" +
"^feed[_-]|" +
"figcaption|" +
"footnote|" +
"foot|" +
"footer|" +
"^ga-track$|" +
" google |" +
"^gstl_|" +
"^GS-UH$|" +
"^guide$|" +
"header|" +
"hidden|" +
"img_popup_single|" +
"inline-share-tools|" +
"inread|" +
"^interstitial-ad-modal$|" +
"^Inv[0-9]$|" +
"js_replies|" +
"[Kk]ona[Ff]ilter|" +
"^kxhead$|" +
"leading|" +
"^lede[_-]container$|" +
"legende?|" +
"^lightningjs-|" +
"links|" +
"^login-modal$|" +
"^lui-mini-profile-body$|" +
"^marginalia|" +
"^marketing[_-]|" +
"^masthead|" +
"mediaarticlerelated|" +
"^media[_-]viewer$|" +
"menu|" +
"menucontainer|" +
"meta$|" +
"^moat$|" +
"moreNews|" +
"^Moses$|" +
"^nav[_-]|" +
"navbar|" +
"[Nn]avigation|" +
"newsUnder|" +
"^oauth|" +
"^overlay[_-]wrapper|" +
"pagetools|" +
"[_-]paid[_-]|" +
"panelss2|" +
"panesCity|" +
"player|" +
"PopularQuestions|" +
"popup|" +
"post[_-]attributes|" +
"post[_-]title|" +
"preview|" +
"[_-]print[_-]|" +
"products\\-events|" +
"^prop[0-9]$|" +
"^pulse-loaders|" +
"^rail$|" +
"recommend|" +
"^registration-modal$|" +
"relacionado|" +
"related|" +
"remote|" +
"retweet|" +
"^ribbon$|" +
"rightBlock|" +
"rss|" +
"runaroundLeft|" +
"search[_-]|" +
"share[_-]|" +
"shoutbox|" +
"sidebar|" +
"^simplereach$|" +
"^site[_-]index$|" +
"site[_-]box|" +
"site[_-]nav|" +
"skyscraper|" +
"social[Nn]etworking|" +
"social_|" +
"social\\-share|" +
"social\\-count|" +
"socialtools|" +
"source|" +
"^speed-bump-wrapper$|" +
"[_-]spinner$|" +
"^Splash$|" +
"sponsor|" +
"^spr-|" +
"storytopbar\\-bucket|" +
"^stream-sidebar|" +
"sub_nav|" +
"subscribe|" +
"subscription|" +
"^suggestions$|" +
"tabsCity|" +
"tag_|" +
"tags|" +
"teaser|" +
"the_answers|" +
"timestamp|" +
"tools|" +
"tooltip|" +
"^Top[0-9]?$|" +
"^TopAd[0-9]?$|" +
"[_-]track[_-]|" +
"tracking|" +
"[^-]twitter|" +
"-uix-button|" +
"updateBrowser|" +
"^username-modal$|" +
"^user-|" +
"utility-bar|" +
"^vestpocket$|" +
"vcard|" +
"^watch-action-panels$|" +
"^watch-discussion$|" +
"welcome_form|" +
"^whats[_-]next$|" +
"wp-caption-text")
// Clean removes HTML elements around the main content and prepares the document for parsing
func (c *Cleaner) Clean(docToClean *goquery.Document) *goquery.Document {
if c.config.debug {
log.Println("Starting cleaning phase with Cleaner")
}
docToClean = c.cleanBr(docToClean)
docToClean = c.cleanArticleTags(docToClean)
docToClean = c.cleanEMTags(docToClean)
docToClean = c.dropCaps(docToClean)
docToClean = c.removeScriptsStyle(docToClean)
docToClean = c.cleanBadTags(docToClean, keepNodesRegEx, removeNodesRegEx, &[]string{"id", "class", "name"})
docToClean = c.cleanBadTags(docToClean, nil, removeVisibilityStyleRegEx, &[]string{"style"})
docToClean = c.removeTags(docToClean, &[]string{"nav", "footer", "aside", "cite"})
docToClean = c.cleanParaSpans(docToClean)
docToClean = c.convertDivsToParagraphs(docToClean, "div")
docToClean = c.convertDivsToParagraphs(docToClean, "span")
docToClean = c.convertDivsToParagraphs(docToClean, "article")
docToClean = c.convertDivsToParagraphs(docToClean, "pre")
return docToClean
}
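// cleanArticleTags strips the id, name and class attributes from <article> elements
// so later attribute-based filters leave them alone.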
func (c *Cleaner) cleanArticleTags(doc *goquery.Document) *goquery.Document {
tags := [3]string{"id", "name", "class"}
articles := doc.Find("article")
articles.Each(func(i int, s *goquery.Selection) {
for _, tag := range tags {
c.config.parser.delAttr(s, tag)
}
})
return doc
}
// cleanBr replaces every <br /> with a "\n\n" text node
func (c *Cleaner) cleanBr(doc *goquery.Document) *goquery.Document {
linebreaks := doc.Find("br")
linebreaks.Each(func(i int, br *goquery.Selection) {
node := br.Get(0)
node.Data = "\n\n"
node.Type = html.TextNode
node.Attr = []html.Attribute{}
node.DataAtom = 0
node.FirstChild = nil
node.LastChild = nil
})
return doc
}
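// cleanEMTags drops <em> tags that do not contain an image, keeping their text.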
func (c *Cleaner) cleanEMTags(doc *goquery.Document) *goquery.Document {
ems := doc.Find("em")
ems.Each(func(i int, s *goquery.Selection) {
images := s.Find("img")
if images.Length() == 0 {
c.config.parser.dropTag(s)
}
})
if c.config.debug {
log.Printf("Cleaning %d EM tags\n", ems.Size())
}
return doc
}
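// removeTags removes every element matching any of the given tag names.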
func (c *Cleaner) removeTags(doc *goquery.Document, tags *[]string) *goquery.Document {
for _, tag := range *tags {
node := doc.Find(tag)
node.Each(func(i int, s *goquery.Selection) {
c.config.parser.removeNode(s)
})
}
return doc
}
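// cleanDivs removes childless <div> elements whose trimmed, lower-cased text appears
// more than once in the document, a heuristic for repeated boilerplate.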
func (c *Cleaner) cleanDivs(doc *goquery.Document) *goquery.Document {
frames := make(map[string]int)
framesNodes := make(map[string]*list.List)
divs := doc.Find("div")
divs.Each(func(i int, s *goquery.Selection) {
children := s.Children()
if children.Size() == 0 {
text := strings.Trim(s.Text(), " \t")
text = strings.ToLower(text)
frames[text]++
if framesNodes[text] == nil {
framesNodes[text] = list.New()
}
framesNodes[text].PushBack(s)
}
})
for text, freq := range frames {
if freq > 1 {
selections := framesNodes[text]
for s := selections.Front(); s != nil; s = s.Next() {
selection := s.Value.(*goquery.Selection)
c.config.parser.removeNode(selection)
}
}
}
return doc
}
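// dropCaps drops <span> tags whose class contains "dropcap" or "drop_cap", keeping the letter itself.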
func (c *Cleaner) dropCaps(doc *goquery.Document) *goquery.Document {
items := doc.Find("span")
count := 0 // number of dropcap spans removed
items.Each(func(i int, s *goquery.Selection) {
attribute, exists := s.Attr("class")
if exists && (strings.Contains(attribute, "dropcap") || strings.Contains(attribute, "drop_cap")) {
c.config.parser.dropTag(s)
count++
}
})
if c.config.debug && count > 0 {
log.Printf("Cleaned %d dropcap tags\n", count)
}
return doc
}
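// removeScriptsStyle removes all script, noscript and style elements.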
func (c *Cleaner) removeScriptsStyle(doc *goquery.Document) *goquery.Document {
if c.config.debug {
log.Println("Starting to remove script tags")
}
count := 0 // number of removed nodes
scripts := doc.Find("script,noscript,style")
scripts.Each(func(i int, s *goquery.Selection) {
c.config.parser.removeNode(s)
count++
})
if c.config.debug && count > 0 {
log.Printf("Removed %d script and style tags\n", scripts.Size())
}
return doc
}
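// cleanBadTags removes any element whose listed attribute matches pattern,
// unless keepPattern also matches that attribute value.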
func (c *Cleaner) cleanBadTags(doc *goquery.Document, keepPattern *regexp.Regexp, pattern *regexp.Regexp, selectors *[]string) *goquery.Document {
body := doc.Find("html")
children := body.Children()
children.Each(func(i int, s *goquery.Selection) {
for _, selector := range *selectors {
naughtyList := s.Find("*[" + selector + "]")
count := 0
naughtyList.Each(func(j int, node *goquery.Selection) {
attribute, _ := node.Attr(selector)
if (keepPattern == nil || !keepPattern.MatchString(attribute)) && pattern.MatchString(attribute) {
if c.config.debug {
log.Printf("Cleaning: Removing node with %s: %s => matched %s\n", selector, c.config.parser.name(selector, node), strings.Join(pattern.FindAllString(attribute, 100), ", "))
}
c.config.parser.removeNode(node)
count++
}
})
if c.config.debug && count > 0 {
log.Printf("%d naughty %s elements found", count, selector)
}
}
})
return doc
}
// cleanParaSpans replaces <p><span>...</span></p> with <p>...</p>
func (c *Cleaner) cleanParaSpans(doc *goquery.Document) *goquery.Document {
spans := doc.Find("span")
spans.Each(func(i int, s *goquery.Selection) {
parent := s.Parent()
if parent != nil && parent.Length() > 0 && parent.Get(0).DataAtom == atom.P {
replaceTagWithContents(s, whitelistedTextAtomTypes)
}
})
return doc
}
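// getFlushedBuffer parses an HTML fragment and returns its top-level body nodes,
// detached from the temporary parse tree.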
func (c *Cleaner) getFlushedBuffer(fragment string) []*html.Node {
var output []*html.Node
reader := strings.NewReader(fragment)
document, _ := html.Parse(reader)
body := document.FirstChild.LastChild
for n := body.FirstChild; n != nil; n = n.NextSibling {
output = append(output, n)
n.Parent = nil
n.PrevSibling = nil
}
// NextSibling is cleared in a second pass so the traversal above stays intact
for _, o := range output {
o.NextSibling = nil
}
return output
}
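// replaceWithPara rewrites the selection's first node as an attribute-free <p> element.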
func (c *Cleaner) replaceWithPara(div *goquery.Selection) {
if div.Size() > 0 {
node := div.Get(0)
node.Data = atom.P.String()
node.DataAtom = atom.P
node.Attr = []html.Attribute{}
}
}
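// tabsAndNewLinesReplacements doubles newlines and strips tabs from the given text.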
func (c *Cleaner) tabsAndNewLinesReplacements(text string) string {
text = strings.Replace(text, "\n", "\n\n", -1)
text = tabsRegEx.ReplaceAllString(text, "")
return text
}
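// convertDivsToParagraphs converts div-like containers into paragraphs: a container whose
// inner HTML already holds structured elements is renamed to <p>; otherwise its loose text
// nodes are gathered into a new <p> inserted before the container and removed from it.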
func (c *Cleaner) convertDivsToParagraphs(doc *goquery.Document, domType string) *goquery.Document {
if c.config.debug {
log.Println("Starting to replace bad divs...")
}
badDivs := 0
convertedTextNodes := 0
divs := doc.Find(domType)
divs.Each(func(i int, div *goquery.Selection) {
divHTML, _ := div.Html()
if divToPElementsPattern.MatchString(divHTML) {
c.replaceWithPara(div)
badDivs++
} else {
var replacementText []string
nodesToRemove := list.New()
children := div.Contents()
if c.config.debug {
log.Printf("Found %d children of div\n", children.Size())
}
children.EachWithBreak(func(i int, kid *goquery.Selection) bool {
text := kid.Text()
kidNode := kid.Get(0)
tag := kidNode.Data
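// a text node keeps its text in Data, so Data == Text() identifies a #text node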
if tag == text {
tag = "#text"
}
if tag == "#text" {
text = strings.Replace(text, "\n", "", -1)
text = tabsRegEx.ReplaceAllString(text, "")
if text == "" {
return true
}
if len(text) > 1 {
prev := kidNode.PrevSibling
if c.config.debug {
log.Printf("PARENT CLASS: %s NODENAME: %s\n", c.config.parser.name("class", div), tag)
log.Printf("TEXTREPLACE: %s\n", strings.Replace(text, "\n", "", -1))
}
if prev != nil && prev.DataAtom == atom.A {
nodeSelection := kid.HasNodes(prev)
html, _ := nodeSelection.Html()
replacementText = append(replacementText, html)
if c.config.debug {
log.Printf("SIBLING NODENAME ADDITION: %s TEXT: %s\n", prev.Data, html)
}
}
replacementText = append(replacementText, text)
nodesToRemove.PushBack(kidNode)
convertedTextNodes++
}
}
return true
})
/*
newNode := new(html.Node)
newNode.Type = html.ElementNode
newNode.Data = strings.Join(replacementText, "")
newNode.DataAtom = atom.P
*/
/*
replacementText = strings.Replace(replacementText, "=C3=A8", "è")
replacementText = strings.Replace(replacementText, "=C3=A9", "é")
*/
div.First().BeforeHtml("<p>" + strings.Join(replacementText, "") + "</p>")
for s := nodesToRemove.Front(); s != nil; s = s.Next() {
node := s.Value.(*html.Node)
if node != nil && node.Parent != nil {
node.Parent.RemoveChild(node)
}
}
}
})
if c.config.debug {
log.Printf("Found %d total divs with %d bad divs replaced and %d textnodes converted inside divs", divs.Size(), badDivs, convertedTextNodes)
}
return doc
}