package goose

import (
	"container/list"
	"log"
	"math"
	"net/url"
	"regexp"
	"strconv"
	"strings"
	"time"

	"github.com/PuerkitoBio/goquery"
	"github.com/araddon/dateparse"
	"github.com/fatih/set"
	"github.com/gigawattio/window"
	"github.com/jaytaylor/html2text"
	"golang.org/x/net/html"
	"golang.org/x/net/html/atom"
)

const defaultLanguage = "en"

var motleyReplacement = "&#65533;" // U+FFFD (decimal 65533) is the Unicode "replacement character".

// var escapedFragmentReplacement = regexp.MustCompile("#!")
// var titleReplacements = regexp.MustCompile("&raquo;")

var titleDelimiters = []string{
	"|",
	" - ",
	" — ",
	"»",
	":",
}

var aRelTagSelector = "a[rel=tag]"
var aHrefTagSelector = [...]string{"/tag/", "/tags/", "/topic/", "?keyword"}

// var langRegEx = "^[A-Za-z]{2}$"

// ContentExtractor can parse the HTML and fetch various properties.
type ContentExtractor struct {
	config Configuration
}

// NewExtractor returns a configured HTML parser.
func NewExtractor(config Configuration) ContentExtractor {
	return ContentExtractor{
		config: config,
	}
}
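
// exampleExtractTitle is a minimal usage sketch, not part of the upstream
// API. It assumes GetDefaultConfiguration from this package's
// configuration.go and a *goquery.Document that the caller has already
// parsed (for example via goquery.NewDocumentFromReader).
func exampleExtractTitle(doc *goquery.Document) string {
	extractor := NewExtractor(GetDefaultConfiguration())
	return extractor.GetTitle(doc)
}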

// getTitleUnmodified returns the title as set in the source: the <title>
// element if present, falling back to the og:title meta property, then to
// post-title/headline elements.
func (extr *ContentExtractor) getTitleUnmodified(document *goquery.Document) string {
	title := ""
	titleElement := document.Find("title")
	if titleElement != nil && titleElement.Size() > 0 {
		title = titleElement.Text()
	}
	if title == "" {
		ogTitleElement := document.Find(`meta[property="og:title"]`)
		if ogTitleElement != nil && ogTitleElement.Size() > 0 {
			title, _ = ogTitleElement.Attr("content")
		}
	}
	if title == "" {
		titleElement = document.Find("post-title,headline")
		if titleElement == nil || titleElement.Size() == 0 {
			return title
		}
		title = titleElement.Text()
	}
	return title
}

// GetTitleFromUnmodifiedTitle returns the cleaned title from the unmodified one.
func (extr *ContentExtractor) GetTitleFromUnmodifiedTitle(title string) string {
	for _, delimiter := range titleDelimiters {
		if strings.Contains(title, delimiter) {
			title = extr.splitTitle(strings.Split(title, delimiter))
			break
		}
	}
	title = strings.Replace(title, motleyReplacement, "", -1)
	if extr.config.debug {
		log.Printf("Page title is %s\n", title)
	}
	return strings.TrimSpace(title)
}

// GetTitle returns the title set in the source, if the article has one.
func (extr *ContentExtractor) GetTitle(document *goquery.Document) string {
	title := extr.getTitleUnmodified(document)
	return extr.GetTitleFromUnmodifiedTitle(title)
}

// splitTitle keeps the longest segment of a delimiter-split title, on the
// assumption that the site name is shorter than the article title.
func (extr *ContentExtractor) splitTitle(titles []string) string {
	largeTextLength := 0
	largeTextIndex := 0
	for i, current := range titles {
		if len(current) > largeTextLength {
			largeTextLength = len(current)
			largeTextIndex = i
		}
	}
	title := titles[largeTextIndex]
	title = strings.Replace(title, "&raquo;", "»", -1)
	return title
}
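
// For example, a raw <title> of "Gophers Invade Wall Street | Example News"
// splits on "|" into two segments, and the longer one, "Gophers Invade Wall
// Street", is kept; any "&raquo;" entities left by sloppy markup are
// restored to the "»" character.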

// GetMetaLanguage returns the meta language set in the source, if the article has one.
func (extr *ContentExtractor) GetMetaLanguage(document *goquery.Document) string {
	var language string
	shtml := document.Find("html")
	attr, _ := shtml.Attr("lang")
	if attr == "" {
		attr, _ = document.Attr("lang")
	}
	if attr == "" {
		// Fall back to <meta http-equiv="content-language" content="...">,
		// reading the content attribute of the matching element.
		document.Find("meta").EachWithBreak(func(i int, s *goquery.Selection) bool {
			if equiv, exists := s.Attr("http-equiv"); exists && equiv == "content-language" {
				attr, _ = s.Attr("content")
				return false
			}
			return true
		})
	}
	idx := strings.LastIndex(attr, "-")
	if idx == -1 {
		language = attr
	} else {
		// Drop the region subtag, e.g. "en-US" -> "en".
		language = attr[0:idx]
	}
	_, ok := sw[language]
	if language == "" || !ok {
		language = extr.config.stopWords.SimpleLanguageDetector(shtml.Text())
		if language == "" {
			language = defaultLanguage
		}
	}
	extr.config.targetLanguage = language
	return language
}

// GetFavicon returns the favicon set in the source, if the article has one.
func (extr *ContentExtractor) GetFavicon(document *goquery.Document) string {
	favicon := ""
	document.Find("link").EachWithBreak(func(i int, s *goquery.Selection) bool {
		attr, exists := s.Attr("rel")
		if exists && strings.Contains(attr, "icon") {
			favicon, _ = s.Attr("href")
			return false
		}
		return true
	})
	return favicon
}

// GetMetaContentWithSelector returns the content attribute of the meta tag matching the selector.
func (extr *ContentExtractor) GetMetaContentWithSelector(document *goquery.Document, selector string) string {
	selection := document.Find(selector)
	content, _ := selection.Attr("content")
	return strings.TrimSpace(content)
}

// GetMetaContent returns the content attribute of the meta tag with the
// given name or itemprop.
func (extr *ContentExtractor) GetMetaContent(document *goquery.Document, metaName string) string {
	content := ""
	document.Find("meta").EachWithBreak(func(i int, s *goquery.Selection) bool {
		attr, exists := s.Attr("name")
		if exists && attr == metaName {
			content, _ = s.Attr("content")
			return false
		}
		attr, exists = s.Attr("itemprop")
		if exists && attr == metaName {
			content, _ = s.Attr("content")
			return false
		}
		return true
	})
	return content
}

// GetMetaContents returns all the requested meta tags as name->content pairs.
func (extr *ContentExtractor) GetMetaContents(document *goquery.Document, metaNames *set.Set) map[string]string {
	contents := make(map[string]string)
	counter := metaNames.Size()
	document.Find("meta").EachWithBreak(func(i int, s *goquery.Selection) bool {
		attr, exists := s.Attr("name")
		if exists && metaNames.Has(attr) {
			content, _ := s.Attr("content")
			contents[attr] = content
			counter--
			if counter == 0 {
				// Every requested name has been seen; stop iterating.
				return false
			}
		}
		return true
	})
	return contents
}
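
// exampleCollectMeta is a usage sketch, not part of the upstream API: it
// gathers several meta values in a single pass over the document's meta
// tags using the same thread-safe set type this file already relies on.
func exampleCollectMeta(extr *ContentExtractor, doc *goquery.Document) map[string]string {
	names := set.New(set.ThreadSafe).(*set.Set)
	names.Add("description", "keywords", "author")
	return extr.GetMetaContents(doc, names)
}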

// GetMetaDescription returns the meta description set in the source, if the article has one.
func (extr *ContentExtractor) GetMetaDescription(document *goquery.Document) string {
	return extr.GetMetaContent(document, "description")
}

// GetMetaKeywords returns the meta keywords set in the source, if the article has them.
func (extr *ContentExtractor) GetMetaKeywords(document *goquery.Document) string {
	return extr.GetMetaContent(document, "keywords")
}

// GetMetaAuthor returns the meta author set in the source, if the article has one.
func (extr *ContentExtractor) GetMetaAuthor(document *goquery.Document) string {
	return extr.GetMetaContent(document, "author")
}

// GetMetaContentLocation returns the meta content location set in the source, if the article has one.
func (extr *ContentExtractor) GetMetaContentLocation(document *goquery.Document) string {
	return extr.GetMetaContent(document, "contentLocation")
}

// GetCanonicalLink returns the meta canonical link set in the source.
func (extr *ContentExtractor) GetCanonicalLink(document *goquery.Document) string {
	metas := document.Find("link[rel=canonical]")
	if metas.Length() > 0 {
		meta := metas.First()
		href, _ := meta.Attr("href")
		href = strings.TrimSpace(href)
		if href != "" {
			return href
		}
	}
	return ""
}

// GetDomain extracts the domain from a link.
func (extr *ContentExtractor) GetDomain(canonicalLink string) string {
	u, err := url.Parse(canonicalLink)
	if err == nil {
		return u.Host
	}
	return ""
}

// GetTags returns the tags set in the source, if the article has them.
func (extr *ContentExtractor) GetTags(document *goquery.Document) *set.Set {
	tags := set.New(set.ThreadSafe).(*set.Set)
	selections := document.Find(aRelTagSelector)
	selections.Each(func(i int, s *goquery.Selection) {
		tags.Add(s.Text())
	})
	selections = document.Find("a")
	selections.Each(func(i int, s *goquery.Selection) {
		href, exists := s.Attr("href")
		if exists {
			for _, part := range aHrefTagSelector {
				if strings.Contains(href, part) {
					tags.Add(s.Text())
				}
			}
		}
	})
	return tags
}

// GetPublishDate returns the publication date, if one can be located.
func (extr *ContentExtractor) GetPublishDate(document *goquery.Document) *time.Time {
	raw, err := document.Html()
	if err != nil {
		log.Printf("Error converting document HTML nodes to raw HTML: %s (publish date detection aborted)\n", err)
		return nil
	}
	text, err := html2text.FromString(raw)
	if err != nil {
		log.Printf("Error converting document HTML to plaintext: %s (publish date detection aborted)\n", err)
		return nil
	}
	text = strings.ToLower(text)
	// Simplify months because the dateparse pkg only handles abbreviated
	// month names. The replacements are an ordered slice rather than a map
	// so that longer forms ("september") are rewritten before their prefixes
	// ("sept") and the output is deterministic.
	for _, r := range []struct{ from, to string }{
		{"january", "jan"},
		{"february", "feb"},
		{"march", "mar"},
		{"april", "apr"},
		// "may" is already its own abbreviation.
		{"june", "jun"},
		{"july", "jul"},
		{"august", "aug"},
		{"september", "sep"},
		{"sept", "sep"},
		{"october", "oct"},
		{"november", "nov"},
		{"december", "dec"},
		{"th,", ","}, // Strip day number suffixes.
		{"rd,", ","},
	} {
		text = strings.Replace(text, r.from, r.to, -1)
	}
	text = strings.Replace(text, "\n", " ", -1)
	text = regexp.MustCompile(" +").ReplaceAllString(text, " ")
	tuple1 := strings.Split(text, " ")
	var (
		expr  = regexp.MustCompile("[0-9]")
		ts    time.Time
		found bool
	)
	// Slide windows of various widths across the token stream and try to
	// parse each digit-bearing window as a date.
	for _, n := range []int{3, 4, 5, 2, 6} {
		for _, win := range window.Rolling(tuple1, n) {
			if !expr.MatchString(strings.Join(win, " ")) {
				continue
			}
			input := strings.Join(win, " ")
			ts, err = dateparse.ParseAny(input)
			if err == nil && ts.Year() > 0 && ts.Month() > 0 && ts.Day() > 0 {
				found = true
				break
			}
			// Try injecting a comma for dateparse.
			win[1] = win[1] + ","
			input = strings.Join(win, " ")
			ts, err = dateparse.ParseAny(input)
			if err == nil && ts.Year() > 0 && ts.Month() > 0 && ts.Day() > 0 {
				found = true
				break
			}
		}
		if found {
			break
		}
	}
	if found {
		return &ts
	}
	return nil
}
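
// As a worked example of the window scan above: plaintext containing
// "posted 17 september 2019 by staff" is lowercased and rewritten to
// "posted 17 sep 2019 by staff"; the width-3 window "17 sep 2019" contains
// digits and should parse via dateparse.ParseAny, so the scan stops before
// wider windows are tried.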

// GetCleanTextAndLinks parses the main HTML node for text and links.
func (extr *ContentExtractor) GetCleanTextAndLinks(topNode *goquery.Selection, lang string) (string, []string) {
	outputFormatter := new(outputFormatter)
	outputFormatter.config = extr.config
	return outputFormatter.getFormattedText(topNode, lang)
}

// CalculateBestNode checks for the HTML node most likely to contain the main
// content. It looks for the clusters of paragraphs and scores each cluster
// by its stop-word count and by how many consecutive paragraphs sit
// together. Position matters too: comments usually live at the bottom of the
// page, so nodes near the bottom receive a lower score.
func (extr *ContentExtractor) CalculateBestNode(document *goquery.Document) *goquery.Selection {
	var topNode *goquery.Selection
	nodesToCheck := extr.nodesToCheck(document)
	if extr.config.debug {
		log.Printf("Nodes to check %d\n", len(nodesToCheck))
	}
	startingBoost := 1.0
	cnt := 0
	i := 0
	parentNodes := set.New(set.ThreadSafe).(*set.Set)
	nodesWithText := list.New()
	for _, node := range nodesToCheck {
		textNode := node.Text()
		ws := extr.config.stopWords.stopWordsCount(extr.config.targetLanguage, textNode)
		highLinkDensity := extr.isHighLinkDensity(node)
		if ws.stopWordCount > 2 && !highLinkDensity {
			nodesWithText.PushBack(node)
		}
	}
	nodesNumber := nodesWithText.Len()
	negativeScoring := 0
	bottomNegativeScoring := float64(nodesNumber) * 0.25
	if extr.config.debug {
		log.Printf("About to inspect num of nodes with text %d\n", nodesNumber)
	}
	for n := nodesWithText.Front(); n != nil; n = n.Next() {
		node := n.Value.(*goquery.Selection)
		boostScore := 0.0
		// Boost early boostable nodes; the boost decays as more are seen.
		if extr.isBoostable(node) {
			if cnt >= 0 {
				boostScore = float64((1.0 / startingBoost) * 50)
				startingBoost++
			}
		}
		// Penalize the bottom 25% of nodes on longer pages.
		if nodesNumber > 15 {
			if float64(nodesNumber-i) <= bottomNegativeScoring {
				booster := bottomNegativeScoring - float64(nodesNumber-i)
				boostScore = -math.Pow(booster, 2.0)
				negScore := math.Abs(boostScore) + float64(negativeScoring)
				if negScore > 40 {
					boostScore = 5.0
				}
			}
		}
		if extr.config.debug {
			log.Printf("Location Boost Score %1.5f on iteration %d id='%s' class='%s'\n", boostScore, i, extr.config.parser.name("id", node), extr.config.parser.name("class", node))
		}
		textNode := node.Text()
		ws := extr.config.stopWords.stopWordsCount(extr.config.targetLanguage, textNode)
		upScore := ws.stopWordCount + int(boostScore)
		// Credit the parent fully and the grandparent at half weight.
		parentNode := node.Parent()
		extr.updateScore(parentNode, upScore)
		extr.updateNodeCount(parentNode, 1)
		if !parentNodes.Has(parentNode) {
			parentNodes.Add(parentNode)
		}
		parentParentNode := parentNode.Parent()
		if parentParentNode != nil {
			extr.updateNodeCount(parentParentNode, 1)
			extr.updateScore(parentParentNode, upScore/2.0)
			if !parentNodes.Has(parentParentNode) {
				parentNodes.Add(parentParentNode)
			}
		}
		cnt++
		i++
	}
	topNodeScore := 0
	parentNodesArray := parentNodes.List()
	for _, p := range parentNodesArray {
		e := p.(*goquery.Selection)
		if extr.config.debug {
			log.Printf("ParentNode: score=%s nodeCount=%s id='%s' class='%s'\n", extr.config.parser.name("gravityScore", e), extr.config.parser.name("gravityNodes", e), extr.config.parser.name("id", e), extr.config.parser.name("class", e))
		}
		score := extr.getScore(e)
		if score >= topNodeScore {
			topNode = e
			topNodeScore = score
		}
		if topNode == nil {
			topNode = e
		}
	}
	return topNode
}
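
// examplePipeline is a hypothetical sketch, not part of the upstream API,
// showing how the entry points in this file chain together: detect the
// language, score the DOM, prune the winning node, then format it.
func examplePipeline(extr *ContentExtractor, doc *goquery.Document) (string, []string) {
	lang := extr.GetMetaLanguage(doc) // also records config.targetLanguage
	topNode := extr.CalculateBestNode(doc)
	if topNode == nil {
		return "", nil
	}
	topNode = extr.PostCleanup(topNode)
	return extr.GetCleanTextAndLinks(topNode, lang)
}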

// getScore returns the gravityScore as an integer from this node.
func (extr *ContentExtractor) getScore(node *goquery.Selection) int {
	return extr.getNodeGravityScore(node)
}

func (extr *ContentExtractor) getNodeGravityScore(node *goquery.Selection) int {
	grvScoreString, exists := node.Attr("gravityScore")
	if !exists {
		return 0
	}
	grvScore, err := strconv.Atoi(grvScoreString)
	if err != nil {
		return 0
	}
	return grvScore
}

// updateScore adds a score to the gravityScore attribute we put on divs:
// read the current score, then add the score passed in.
func (extr *ContentExtractor) updateScore(node *goquery.Selection, addToScore int) {
	currentScore := 0
	var err error
	scoreString, _ := node.Attr("gravityScore")
	if scoreString != "" {
		currentScore, err = strconv.Atoi(scoreString)
		if err != nil {
			currentScore = 0
		}
	}
	newScore := currentScore + addToScore
	extr.config.parser.setAttr(node, "gravityScore", strconv.Itoa(newScore))
}

// updateNodeCount stores how many decent nodes are under a parent node.
func (extr *ContentExtractor) updateNodeCount(node *goquery.Selection, addToCount int) {
	currentScore := 0
	var err error
	scoreString, _ := node.Attr("gravityNodes")
	if scoreString != "" {
		currentScore, err = strconv.Atoi(scoreString)
		if err != nil {
			currentScore = 0
		}
	}
	newScore := currentScore + addToCount
	extr.config.parser.setAttr(node, "gravityNodes", strconv.Itoa(newScore))
}
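
// Scores live directly on the DOM as string attributes: after two updates
// of, say, 10 and 5 against the same div, the element carries
// gravityScore="15", while gravityNodes records how many scoring paragraphs
// contributed to it.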

// isBoostable decides whether a node's parent deserves a boost. A lot of
// times the first paragraph is just the caption under an image, so before
// boosting a parent node we make sure it is connected to other paragraphs:
// a nearby following sibling must be a paragraph with some substantial
// stop-word weight to it.
func (extr *ContentExtractor) isBoostable(node *goquery.Selection) bool {
	stepsAway := 0
	next := node.Next()
	for next.Length() > 0 && stepsAway < node.Siblings().Length() {
		currentNodeTag := next.Get(0).DataAtom.String()
		if currentNodeTag == "p" {
			if stepsAway >= 3 {
				if extr.config.debug {
					log.Println("Next paragraph is too far away, not boosting")
				}
				return false
			}
			paraText := next.Text()
			ws := extr.config.stopWords.stopWordsCount(extr.config.targetLanguage, paraText)
			if ws.stopWordCount > 5 {
				if extr.config.debug {
					log.Println("We're gonna boost this node, seems content")
				}
				return true
			}
		}
		stepsAway++
		next = next.Next()
	}
	return false
}

// nodesToCheck returns a list of nodes we want to search on, like paragraphs
// and tables.
func (extr *ContentExtractor) nodesToCheck(doc *goquery.Document) []*goquery.Selection {
	var output []*goquery.Selection
	tags := []string{"p", "pre", "td"}
	for _, tag := range tags {
		selections := doc.Children().Find(tag)
		if selections != nil {
			selections.Each(func(i int, s *goquery.Selection) {
				output = append(output, s)
			})
		}
	}
	return output
}

// isHighLinkDensity checks the density of links within a node: if there is
// not much text and most of it is links, the node is no good.
func (extr *ContentExtractor) isHighLinkDensity(node *goquery.Selection) bool {
	links := node.Find("a")
	if links == nil || links.Size() == 0 {
		return false
	}
	text := node.Text()
	words := strings.Split(text, " ")
	nwords := len(words)
	var sb []string
	links.Each(func(i int, s *goquery.Selection) {
		linkText := s.Text()
		sb = append(sb, linkText)
	})
	linkText := strings.Join(sb, "")
	linkWords := strings.Split(linkText, " ")
	nlinkWords := len(linkWords)
	nlinks := links.Size()
	linkDivisor := float64(nlinkWords) / float64(nwords)
	score := linkDivisor * float64(nlinks)
	if extr.config.debug {
		var logText string
		if len(node.Text()) >= 51 {
			logText = node.Text()[0:50]
		} else {
			logText = node.Text()
		}
		log.Printf("Calculated link density score as %1.5f for node %s\n", score, logText)
	}
	return score > 1.0
}
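
// As a numeric illustration of the score above: a node holding 40 words of
// text whose anchors contribute 20 words across 4 links scores
// (20/40)*4 = 2.0, which exceeds the 1.0 threshold, so the node is treated
// as a link cluster.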

// isTableAndNoParaExist reports whether a node is a table-like container
// with no substantial paragraphs left once short ones are removed.
func (extr *ContentExtractor) isTableAndNoParaExist(selection *goquery.Selection) bool {
	subParagraph := selection.Find("p")
	subParagraph.Each(func(i int, s *goquery.Selection) {
		txt := s.Text()
		if len(txt) < 25 {
			node := s.Get(0)
			parent := node.Parent
			parent.RemoveChild(node)
		}
	})
	subParagraph2 := selection.Find("p")
	if subParagraph2.Length() == 0 && selection.Get(0).DataAtom.String() != "td" {
		return true
	}
	return false
}

// isNodescoreThresholdMet reports whether a candidate node scores at least
// 8% of the top node's gravity score (table cells are always kept).
func (extr *ContentExtractor) isNodescoreThresholdMet(node *goquery.Selection, e *goquery.Selection) bool {
	topNodeScore := extr.getNodeGravityScore(node)
	currentNodeScore := extr.getNodeGravityScore(e)
	thresholdScore := float64(topNodeScore) * 0.08
	if (float64(currentNodeScore) < thresholdScore) && e.Get(0).DataAtom.String() != "td" {
		return false
	}
	return true
}

// getSiblingsScore computes a normalized baseline score for the paragraphs
// inside the top node. Long articles can have tons of paragraphs, so scoring
// against the total text score of those paragraphs would be unfair; instead
// we normalize on the average paragraph score. For example, if the total
// score of 10 paragraphs is 1000 and each averages 100, then 100 is our base.
func (extr *ContentExtractor) getSiblingsScore(topNode *goquery.Selection) int {
	base := 100000
	paragraphNumber := 0
	paragraphScore := 0
	nodesToCheck := topNode.Find("p")
	nodesToCheck.Each(func(i int, s *goquery.Selection) {
		textNode := s.Text()
		ws := extr.config.stopWords.stopWordsCount(extr.config.targetLanguage, textNode)
		highLinkDensity := extr.isHighLinkDensity(s)
		if ws.stopWordCount > 2 && !highLinkDensity {
			paragraphNumber++
			paragraphScore += ws.stopWordCount
		}
	})
	if paragraphNumber > 0 {
		base = paragraphScore / paragraphNumber
	}
	return base
}

// getSiblingsContent collects the paragraphs of a sibling node that score
// above a fraction of the baseline, wrapping each one in a detached text
// node so it can be grafted onto the top node.
func (extr *ContentExtractor) getSiblingsContent(currentSibling *goquery.Selection, baselinescoreSiblingsPara float64) []*goquery.Selection {
	var ps []*goquery.Selection
	if currentSibling.Get(0).DataAtom.String() == "p" && len(currentSibling.Text()) > 0 {
		ps = append(ps, currentSibling)
		return ps
	}
	potentialParagraphs := currentSibling.Find("p")
	potentialParagraphs.Each(func(i int, s *goquery.Selection) {
		text := s.Text()
		if len(text) > 0 {
			ws := extr.config.stopWords.stopWordsCount(extr.config.targetLanguage, text)
			paragraphScore := ws.stopWordCount
			siblingBaselineScore := 0.30
			highLinkDensity := extr.isHighLinkDensity(s)
			score := siblingBaselineScore * baselinescoreSiblingsPara
			if score < float64(paragraphScore) && !highLinkDensity {
				// Build a fresh text node carrying the paragraph's text.
				node := new(html.Node)
				node.Type = html.TextNode
				node.Data = text
				node.DataAtom = atom.P
				nodes := make([]*html.Node, 1)
				nodes[0] = node
				newSelection := new(goquery.Selection)
				newSelection.Nodes = nodes
				ps = append(ps, newSelection)
			}
		}
	})
	return ps
}

// walkSiblings collects the preceding siblings of a node, nearest first.
func (extr *ContentExtractor) walkSiblings(node *goquery.Selection) []*goquery.Selection {
	currentSibling := node.Prev()
	var b []*goquery.Selection
	for currentSibling.Length() != 0 {
		b = append(b, currentSibling)
		previousSibling := currentSibling.Prev()
		currentSibling = previousSibling
	}
	return b
}

// addSiblings adds any siblings that may have a decent score to this node.
func (extr *ContentExtractor) addSiblings(topNode *goquery.Selection) *goquery.Selection {
	if extr.config.debug {
		log.Println("Starting to add siblings")
	}
	baselinescoreSiblingsPara := extr.getSiblingsScore(topNode)
	results := extr.walkSiblings(topNode)
	for _, currentNode := range results {
		ps := extr.getSiblingsContent(currentNode, float64(baselinescoreSiblingsPara))
		for _, p := range ps {
			// Prepend the qualifying sibling to the top node's node list.
			nodes := make([]*html.Node, len(topNode.Nodes)+1)
			nodes[0] = p.Get(0)
			for i, node := range topNode.Nodes {
				nodes[i+1] = node
			}
			topNode.Nodes = nodes
		}
	}
	return topNode
}

// PostCleanup removes any divs that look like non-content, clusters of
// links, or paragraphs with no gusto.
func (extr *ContentExtractor) PostCleanup(targetNode *goquery.Selection) *goquery.Selection {
	if extr.config.debug {
		log.Println("Starting cleanup Node")
	}
	node := extr.addSiblings(targetNode)
	children := node.Children()
	children.Each(func(i int, s *goquery.Selection) {
		tag := s.Get(0).DataAtom.String()
		if tag != "p" {
			if extr.config.debug {
				log.Printf("CLEANUP NODE: %s class: %s\n", extr.config.parser.name("id", s), extr.config.parser.name("class", s))
			}
			// if extr.isHighLinkDensity(s) || extr.isTableAndNoParaExist(s) || !extr.isNodescoreThresholdMet(node, s) {
			if extr.isHighLinkDensity(s) {
				extr.config.parser.removeNode(s)
				return
			}
			subParagraph := s.Find("p")
			subParagraph.Each(func(j int, e *goquery.Selection) {
				if len(e.Text()) < 25 {
					extr.config.parser.removeNode(e)
				}
			})
			subParagraph2 := s.Find("p")
			if subParagraph2.Length() == 0 && tag != "td" {
				if extr.config.debug {
					log.Println("Removing node because it doesn't have any paragraphs")
				}
				extr.config.parser.removeNode(s)
			} else {
				if extr.config.debug {
					log.Println("Not removing TD node")
				}
			}
			return
		}
	})
	return node
}