package justext

import (
	"errors"
	"fmt"
	"io"
	"io/ioutil"
	"strings"

	"github.com/levigross/exp-html"
)

// Reader reads an HTML document from an underlying io.Reader and turns it
// into a slice of classified paragraphs.
//
// The exported fields are tuning parameters that ReadAll forwards to the
// classification steps. NewReader fills in defaults for all of them except
// Stoplist, which it leaves nil.
type Reader struct {
	// LengthLow and LengthHigh are passed to classifyParagraphs.
	// NOTE(review): presumably paragraph length thresholds — confirm
	// against classifyParagraphs.
	LengthLow  int
	LengthHigh int

	// Stoplist is passed to classifyParagraphs.
	// NOTE(review): presumably the set of stop words for the document's
	// language; NewReader does not populate it — confirm callers set it.
	Stoplist map[string]bool

	// StopwordsLow and StopwordsHigh are passed to classifyParagraphs.
	StopwordsLow  float64
	StopwordsHigh float64

	// MaxLinkDensity is passed to classifyParagraphs.
	MaxLinkDensity float64

	// MaxHeadingDistance is passed to reviseParagraphClassification.
	MaxHeadingDistance int

	// NoHeadings is passed to classifyParagraphs.
	NoHeadings bool

	// r is the source of the HTML document consumed by ReadAll.
	r io.Reader
}

// NewReader returns a Reader that consumes HTML from r, with the default
// tuning parameters set. Stoplist is left nil.
func NewReader(r io.Reader) *Reader {
	return &Reader{
		LengthLow:          70,
		LengthHigh:         200,
		StopwordsLow:       0.30,
		StopwordsHigh:      0.32,
		MaxLinkDensity:     0.2,
		MaxHeadingDistance: 200,
		NoHeadings:         false,
		r:                  r,
	}
}

// ReadAll consumes the whole underlying reader, preprocesses the HTML,
// builds the paragraph model, and runs both classification passes over it.
//
// It returns the resulting paragraphs, or an error if the Reader has no
// input source, reading fails, or any pipeline stage fails or yields an
// empty result.
func (r *Reader) ReadAll() ([]*Paragraph, error) {
	// A zero-value Reader (or nil receiver) has no source; without this
	// guard ioutil.ReadAll would panic on the nil io.Reader.
	if r == nil || r.r == nil {
		return nil, errors.New("justext: nil input reader")
	}

	in, err := ioutil.ReadAll(r.r)
	if err != nil {
		return nil, err
	}

	root, err := preprocess(string(in), "utf-8", "utf-8", "errors")
	if err != nil {
		return nil, err
	}
	if root == nil {
		return nil, errors.New("Preprocess has resulted in nil")
	}

	htmlSource := nodesToString(root)
	if len(htmlSource) == 0 {
		return nil, errors.New("MAIN: perprocess has returned an empty string")
	}

	p, err := paragraphObjectModel(htmlSource)
	if err != nil {
		return nil, err
	}
	if p == nil {
		return nil, errors.New("MAIN: P is nil")
	}

	classifyParagraphs(p, r.Stoplist, r.LengthLow, r.LengthHigh,
		r.StopwordsLow, r.StopwordsHigh, r.MaxLinkDensity, r.NoHeadings)
	reviseParagraphClassification(p, r.MaxHeadingDistance)

	return p, nil
}

// dumpNodes renders n as a text line for debugging: tab spaces of
// indentation, a short tag for the node type, a colon, and the node's Data
// with newlines removed and surrounding whitespace trimmed.
//
// When exploreChildNodes is true, every child is rendered recursively one
// indentation level deeper and appended after the node's own line. A nil n
// yields the empty string.
func dumpNodes(n *html.Node, tab int, exploreChildNodes bool) string {
	if n == nil {
		return ""
	}

	// Collect the children first and join once, so the cost stays linear in
	// the output size instead of re-copying the accumulated text per child.
	var children []string
	if exploreChildNodes {
		children = make([]string, 0, len(n.Child))
		for _, c := range n.Child {
			children = append(children, dumpNodes(c, tab+1, true)+"\n")
		}
	}
	childNodes := strings.Join(children, "")

	// Node types not listed here leave the tag empty.
	var t string
	switch n.Type {
	case html.ErrorNode:
		t = "Err"
	case html.TextNode:
		t = "T"
	case html.DocumentNode:
		t = "D"
	case html.ElementNode:
		t = "E"
	case html.CommentNode:
		t = "C"
	case html.DoctypeNode:
		t = "Dt"
	}

	tabStr := strings.Repeat(" ", tab)
	data := strings.TrimSpace(strings.Replace(n.Data, "\n", "", -1))
	return fmt.Sprintf("%s%s:%s\n%s", tabStr, t, data, childNodes)
}