mattermost-community-enterp.../vendor/github.com/advancedlogic/GoOse/cleaner.go
package goose

import (
"container/list"
"log"
"regexp"
"strings"

"github.com/PuerkitoBio/goquery"
"golang.org/x/net/html"
"golang.org/x/net/html/atom"
)
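// Atom types whose tags may be collapsed into their text content; the "Ext" list additionally allows anchors.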
var whitelistedTextAtomTypes = []atom.Atom{atom.Span, atom.Em, atom.I, atom.Strong, atom.B, atom.P, atom.H1, atom.H2, atom.H3, atom.H4}
var whitelistedExtAtomTypes = []atom.Atom{atom.A, atom.Span, atom.Em, atom.I, atom.Strong, atom.B, atom.P, atom.H1, atom.H2, atom.H3, atom.H4}
// Cleaner removes menus, ads, sidebars, etc. and leaves the main content
type Cleaner struct {
config Configuration
}
// NewCleaner returns a new instance of a Cleaner
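//
// Illustrative use (sketch; the Configuration value is normally supplied by
// the surrounding GoOse pipeline):
//
//	cleaner := NewCleaner(config)
//	doc = cleaner.Clean(doc)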
func NewCleaner(config Configuration) Cleaner {
return Cleaner{
config: config,
}
}
// replaceTagWithContents removes the tag, replacing it with its text contents
// e.g. "<em>some text</em>" becomes "some text"
func replaceTagWithContents(tagSelection *goquery.Selection, collapsibleAtomTypes []atom.Atom) {
if tagSelection.Length() == 0 {
return
}
node := tagSelection.Get(0)
node.Data = tagSelection.Text()
node.Type = html.TextNode
if node.FirstChild == nil {
node.Attr = []html.Attribute{}
node.DataAtom = 0
node.FirstChild = nil
node.LastChild = nil
} else {
// If all children are text only, the parent already contains the text, so drop them
collapseTextNodes(node, collapsibleAtomTypes)
}
}
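// isAtomTypeWhitelisted reports whether t appears in the given whitelist.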
func isAtomTypeWhitelisted(t atom.Atom, whitelist []atom.Atom) bool {
for _, allowed := range whitelist {
if t == allowed {
return true
}
}
return false
}
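// collapseTextNodes recursively drops the children of whitelisted nodes that contain only text,
// since the enclosing text node set up by replaceTagWithContents already carries the combined text.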
func collapseTextNodes(node *html.Node, collapsibleAtomTypes []atom.Atom) {
if node.FirstChild == nil {
return
}
if !isAtomTypeWhitelisted(node.DataAtom, collapsibleAtomTypes) {
return
}
if node.FirstChild.DataAtom == 0 && node.FirstChild == node.LastChild {
// this tag only contains a single textual node, already contained in the parent
node.Attr = []html.Attribute{}
node.Type = html.TextNode
node.DataAtom = 0
node.FirstChild = nil
node.LastChild = nil
return
}
// If all children are text only, the parent already contains the text, so drop them
allTextNodes := true
for c := node.FirstChild; c != nil; c = c.NextSibling {
// attempt collapsing recursively
collapseTextNodes(c, collapsibleAtomTypes)
if c.DataAtom != 0 {
// not collapsed
allTextNodes = false
break
}
}
if allTextNodes {
// text already contained in the parent node => drop children
node.Attr = []html.Attribute{}
node.Type = html.TextNode
node.DataAtom = 0
node.FirstChild = nil
node.LastChild = nil
}
}
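// divToPElementsPattern matches opening tags that mark a container as holding structured content rather than loose text.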
var divToPElementsPattern = regexp.MustCompile("<(a|blockquote|dl|div|img|ol|p|pre|table|ul)")
// tabsRegEx matches tab characters and whitespace-only text
var tabsRegEx = regexp.MustCompile(`\t|^\s+$`)
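// removeVisibilityStyleRegEx matches inline styles that hide an element.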
var removeVisibilityStyleRegEx = regexp.MustCompile("visibility:[ ]*hidden|display:[ ]*none")
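// keepNodesRegEx protects attribute values from removal even when removeNodesRegEx matches them.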
var keepNodesRegEx = regexp.MustCompile(`\b(` +
`article|` + // theguardian.com and newyorker.com (preventing match of "commercial" or "...-ad-...")
`field--label-hidden` + // eff.org (preventing match of "hidden")
`)\b`)
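// removeNodesRegEx matches id, class and name values typical of boilerplate:
// ads, navigation, comments, social widgets, footers and similar page chrome.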
var removeNodesRegEx = regexp.MustCompile("" +
"[Cc]omentario|" +
"[Ff]ooter|" +
"^fn$|" +
"^inset$|" +
"^print$|" +
"^scroll$|" +
"^side$|" +
"^side_|" +
"^widget$|" +
"^ab[0-9]$|" +
"[_-]ads$|" +
"^ad[s]?[ _-]|" +
"[_-]ad[s]?[_-]|" +
"^ADX_CLIENTSIDE$|" +
"ajoutVideo|" +
"^alerts|" +
"^Anchor$|" +
"articleheadings|" +
"_articles|" +
"^article-gallery-embedded$|" +
"author|" +
"author-dropdown|" +
"^banner|" +
"^bar$|" +
"blog-pager|" +
"brass\\-rail|" +
"breadcrumbs|" +
"button|" +
"byline|" +
"cabecalho|" +
"^caption$|" +
"carousel|" +
"^click|" +
"cnnStryHghLght|" +
"cnn_html_slideshow|" +
"cnn_strycaptiontxt|" +
"cnn_strylftcntnt|" +
"cnn_stryspcvbx|" +
"combx|" +
"comment|" +
"commercial|" +
"communitypromo|" +
"^comscore$|" +
"contact|" +
"contentTools2|" +
"controls|" +
"cookie|" +
"CoversMainContent|" +
"^css-|" +
"^critical-alerts$|" +
"^date$|" +
"detail_new_|" +
"downloadLink|" +
"^DYSRC$|" +
"^early-body|" +
"ec_blogs|" +
"^[^entry-]more.*$|" +
"error|" +
"[^-]facebook|" +
"facebook-broadcasting|" +
"^fb-root$|" +
"^feed[_-]|" +
"figcaption|" +
"footnote|" +
"foot|" +
"footer|" +
"^ga-track$|" +
" google |" +
"^gstl_|" +
"^GS-UH$|" +
"^guide$|" +
"header|" +
"hidden|" +
"img_popup_single|" +
"inline-share-tools|" +
"inread|" +
"^interstitial-ad-modal$|" +
"^Inv[0-9]$|" +
"js_replies|" +
"[Kk]ona[Ff]ilter|" +
"^kxhead$|" +
"leading|" +
"^lede[_-]container$|" +
"legende?|" +
"^lightningjs-|" +
"links|" +
"^login-modal$|" +
"^lui-mini-profile-body$|" +
"^marginalia|" +
"^marketing[_-]|" +
"^masthead|" +
"mediaarticlerelated|" +
"^media[_-]viewer$|" +
"menu|" +
"menucontainer|" +
"meta$|" +
"^moat$|" +
"moreNews|" +
"^Moses$|" +
"^nav[_-]|" +
"navbar|" +
"[Nn]avigation|" +
"newsUnder|" +
"^oauth|" +
"^overlay[_-]wrapper|" +
"pagetools|" +
"[_-]paid[_-]|" +
"panelss2|" +
"panesCity|" +
"player|" +
"PopularQuestions|" +
"popup|" +
"post[_-]attributes|" +
"post[_-]title|" +
"preview|" +
"[_-]print[_-]|" +
"products\\-events|" +
"^prop[0-9]$|" +
"^pulse-loaders|" +
"^rail$|" +
"recommend|" +
"^registration-modal$|" +
"relacionado|" +
"related|" +
"remote|" +
"retweet|" +
"^ribbon$|" +
"rightBlock|" +
"rss|" +
"runaroundLeft|" +
"search[_-]|" +
"share[_-]|" +
"shoutbox|" +
"sidebar|" +
"^simplereach$|" +
"^site[_-]index$|" +
"site[_-]box|" +
"site[_-]nav|" +
"skyscraper|" +
"social[Nn]etworking|" +
"social_|" +
"social\\-share|" +
"social\\-count|" +
"socialtools|" +
"source|" +
"^speed-bump-wrapper$|" +
"[_-]spinner$|" +
"^Splash$|" +
"sponsor|" +
"^spr-|" +
"storytopbar\\-bucket|" +
"^stream-sidebar|" +
"sub_nav|" +
"subscribe|" +
"subscription|" +
"^suggestions$|" +
"tabsCity|" +
"tag_|" +
"tags|" +
"teaser|" +
"the_answers|" +
"timestamp|" +
"tools|" +
"tooltip|" +
"^Top[0-9]?$|" +
"^TopAd[0-9]?$|" +
"[_-]track[_-]|" +
"tracking|" +
"[^-]twitter|" +
"-uix-button|" +
"updateBrowser|" +
"^username-modal$|" +
"^user-|" +
"utility-bar|" +
"^vestpocket$|" +
"vcard|" +
"^watch-action-panels$|" +
"^watch-discussion$|" +
"welcome_form|" +
"^whats[_-]next$|" +
"wp-caption-text")
// Clean removes HTML elements around the main content and prepares the document for parsing
func (c *Cleaner) Clean(docToClean *goquery.Document) *goquery.Document {
if c.config.debug {
log.Println("Starting cleaning phase with Cleaner")
}
docToClean = c.cleanBr(docToClean)
docToClean = c.cleanArticleTags(docToClean)
docToClean = c.cleanEMTags(docToClean)
docToClean = c.dropCaps(docToClean)
docToClean = c.removeScriptsStyle(docToClean)
docToClean = c.cleanBadTags(docToClean, keepNodesRegEx, removeNodesRegEx, &[]string{"id", "class", "name"})
docToClean = c.cleanBadTags(docToClean, nil, removeVisibilityStyleRegEx, &[]string{"style"})
docToClean = c.removeTags(docToClean, &[]string{"nav", "footer", "aside", "cite"})
docToClean = c.cleanParaSpans(docToClean)
docToClean = c.convertDivsToParagraphs(docToClean, "div")
docToClean = c.convertDivsToParagraphs(docToClean, "span")
docToClean = c.convertDivsToParagraphs(docToClean, "article")
docToClean = c.convertDivsToParagraphs(docToClean, "pre")
return docToClean
}
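// cleanArticleTags strips the id, name and class attributes from <article> elements
// so later attribute-based filters leave them alone.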
func (c *Cleaner) cleanArticleTags(doc *goquery.Document) *goquery.Document {
tags := [3]string{"id", "name", "class"}
articles := doc.Find("article")
articles.Each(func(i int, s *goquery.Selection) {
for _, tag := range tags {
c.config.parser.delAttr(s, tag)
}
})
return doc
}
// cleanBr replaces every <br /> with a "\n\n" text node
func (c *Cleaner) cleanBr(doc *goquery.Document) *goquery.Document {
linebreaks := doc.Find("br")
linebreaks.Each(func(i int, br *goquery.Selection) {
node := br.Get(0)
node.Data = "\n\n"
node.Type = html.TextNode
node.Attr = []html.Attribute{}
node.DataAtom = 0
node.FirstChild = nil
node.LastChild = nil
})
return doc
}
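// cleanEMTags drops <em> tags that do not contain an image, keeping their text.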
func (c *Cleaner) cleanEMTags(doc *goquery.Document) *goquery.Document {
ems := doc.Find("em")
ems.Each(func(i int, s *goquery.Selection) {
images := s.Find("img")
if images.Length() == 0 {
c.config.parser.dropTag(s)
}
})
if c.config.debug {
log.Printf("Cleaning %d EM tags\n", ems.Size())
}
return doc
}
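// removeTags removes every element matching any of the given tag names.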
func (c *Cleaner) removeTags(doc *goquery.Document, tags *[]string) *goquery.Document {
for _, tag := range *tags {
node := doc.Find(tag)
node.Each(func(i int, s *goquery.Selection) {
c.config.parser.removeNode(s)
})
}
return doc
}
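// cleanDivs removes childless <div> elements whose trimmed, lower-cased text appears
// more than once in the document, a heuristic for repeated boilerplate.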
func (c *Cleaner) cleanDivs(doc *goquery.Document) *goquery.Document {
frames := make(map[string]int)
framesNodes := make(map[string]*list.List)
divs := doc.Find("div")
divs.Each(func(i int, s *goquery.Selection) {
children := s.Children()
if children.Size() == 0 {
text := strings.Trim(s.Text(), " \t")
text = strings.ToLower(text)
frames[text]++
if framesNodes[text] == nil {
framesNodes[text] = list.New()
}
framesNodes[text].PushBack(s)
}
})
for text, freq := range frames {
if freq > 1 {
selections := framesNodes[text]
for s := selections.Front(); s != nil; s = s.Next() {
selection := s.Value.(*goquery.Selection)
c.config.parser.removeNode(selection)
}
}
}
return doc
}
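// dropCaps drops <span> tags whose class contains "dropcap" or "drop_cap", keeping the letter itself.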
func (c *Cleaner) dropCaps(doc *goquery.Document) *goquery.Document {
items := doc.Find("span")
count := 0 // number of dropcap spans removed
items.Each(func(i int, s *goquery.Selection) {
attribute, exists := s.Attr("class")
if exists && (strings.Contains(attribute, "dropcap") || strings.Contains(attribute, "drop_cap")) {
c.config.parser.dropTag(s)
count++
}
})
if c.config.debug && count > 0 {
log.Printf("Cleaned %d dropcap tags\n", count)
}
return doc
}
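// removeScriptsStyle removes all script, noscript and style elements.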
func (c *Cleaner) removeScriptsStyle(doc *goquery.Document) *goquery.Document {
if c.config.debug {
log.Println("Starting to remove script tags")
}
count := 0 // number of removed nodes
scripts := doc.Find("script,noscript,style")
scripts.Each(func(i int, s *goquery.Selection) {
c.config.parser.removeNode(s)
count++
})
if c.config.debug && count > 0 {
log.Printf("Removed %d script and style tags\n", scripts.Size())
}
return doc
}
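// cleanBadTags removes any element whose listed attribute matches pattern,
// unless keepPattern also matches that attribute value.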
func (c *Cleaner) cleanBadTags(doc *goquery.Document, keepPattern *regexp.Regexp, pattern *regexp.Regexp, selectors *[]string) *goquery.Document {
body := doc.Find("html")
children := body.Children()
children.Each(func(i int, s *goquery.Selection) {
for _, selector := range *selectors {
naughtyList := s.Find("*[" + selector + "]")
count := 0
naughtyList.Each(func(j int, node *goquery.Selection) {
attribute, _ := node.Attr(selector)
if (keepPattern == nil || !keepPattern.MatchString(attribute)) && pattern.MatchString(attribute) {
if c.config.debug {
log.Printf("Cleaning: Removing node with %s: %s => matched %s\n", selector, c.config.parser.name(selector, node), strings.Join(pattern.FindAllString(attribute, 100), ", "))
}
c.config.parser.removeNode(node)
count++
}
})
if c.config.debug && count > 0 {
log.Printf("%d naughty %s elements found", count, selector)
}
}
})
return doc
}
// cleanParaSpans replaces <p><span>...</span></p> with <p>...</p>
func (c *Cleaner) cleanParaSpans(doc *goquery.Document) *goquery.Document {
spans := doc.Find("span")
spans.Each(func(i int, s *goquery.Selection) {
parent := s.Parent()
if parent != nil && parent.Length() > 0 && parent.Get(0).DataAtom == atom.P {
replaceTagWithContents(s, whitelistedTextAtomTypes)
}
})
return doc
}
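// getFlushedBuffer parses an HTML fragment and returns its top-level body nodes,
// detached from the temporary parse tree.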
func (c *Cleaner) getFlushedBuffer(fragment string) []*html.Node {
var output []*html.Node
reader := strings.NewReader(fragment)
document, _ := html.Parse(reader)
body := document.FirstChild.LastChild
for n := body.FirstChild; n != nil; n = n.NextSibling {
output = append(output, n)
n.Parent = nil
n.PrevSibling = nil
}
// NextSibling is cleared in a second pass so the traversal above stays intact
for _, o := range output {
o.NextSibling = nil
}
return output
}
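// replaceWithPara rewrites the selection's first node as an attribute-free <p> element.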
func (c *Cleaner) replaceWithPara(div *goquery.Selection) {
if div.Size() > 0 {
node := div.Get(0)
node.Data = atom.P.String()
node.DataAtom = atom.P
node.Attr = []html.Attribute{}
}
}
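// tabsAndNewLinesReplacements doubles newlines and strips tabs from the given text.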
func (c *Cleaner) tabsAndNewLinesReplacements(text string) string {
text = strings.Replace(text, "\n", "\n\n", -1)
text = tabsRegEx.ReplaceAllString(text, "")
return text
}
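// convertDivsToParagraphs converts div-like containers into paragraphs: a container whose
// inner HTML already holds structured elements is renamed to <p>; otherwise its loose text
// nodes are gathered into a new <p> inserted before the container and removed from it.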
func (c *Cleaner) convertDivsToParagraphs(doc *goquery.Document, domType string) *goquery.Document {
if c.config.debug {
log.Println("Starting to replace bad divs...")
}
badDivs := 0
convertedTextNodes := 0
divs := doc.Find(domType)
divs.Each(func(i int, div *goquery.Selection) {
divHTML, _ := div.Html()
if divToPElementsPattern.MatchString(divHTML) {
c.replaceWithPara(div)
badDivs++
} else {
var replacementText []string
nodesToRemove := list.New()
children := div.Contents()
if c.config.debug {
log.Printf("Found %d children of div\n", children.Size())
}
children.EachWithBreak(func(i int, kid *goquery.Selection) bool {
text := kid.Text()
kidNode := kid.Get(0)
tag := kidNode.Data
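// a text node keeps its text in Data, so Data == Text() identifies a #text node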
if tag == text {
tag = "#text"
}
if tag == "#text" {
text = strings.Replace(text, "\n", "", -1)
text = tabsRegEx.ReplaceAllString(text, "")
if text == "" {
return true
}
if len(text) > 1 {
prev := kidNode.PrevSibling
if c.config.debug {
log.Printf("PARENT CLASS: %s NODENAME: %s\n", c.config.parser.name("class", div), tag)
log.Printf("TEXTREPLACE: %s\n", strings.Replace(text, "\n", "", -1))
}
if prev != nil && prev.DataAtom == atom.A {
nodeSelection := kid.HasNodes(prev)
html, _ := nodeSelection.Html()
replacementText = append(replacementText, html)
if c.config.debug {
log.Printf("SIBLING NODENAME ADDITION: %s TEXT: %s\n", prev.Data, html)
}
}
replacementText = append(replacementText, text)
nodesToRemove.PushBack(kidNode)
convertedTextNodes++
}
}
return true
})
/*
newNode := new(html.Node)
newNode.Type = html.ElementNode
newNode.Data = strings.Join(replacementText, "")
newNode.DataAtom = atom.P
*/
/*
replacementText = strings.Replace(replacementText, "=C3=A8", "è")
replacementText = strings.Replace(replacementText, "=C3=A9", "é")
*/
div.First().BeforeHtml("<p>" + strings.Join(replacementText, "") + "</p>")
for s := nodesToRemove.Front(); s != nil; s = s.Next() {
node := s.Value.(*html.Node)
if node != nil && node.Parent != nil {
node.Parent.RemoveChild(node)
}
}
}
})
if c.config.debug {
log.Printf("Found %d total divs with %d bad divs replaced and %d textnodes converted inside divs", divs.Size(), badDivs, convertedTextNodes)
}
return doc
}