mattermost-community-enterp.../vendor/code.sajari.com/docconv/v2/pdf_ocr.go
Claude ec1f89217a Merge: Complete Mattermost Server with Community Enterprise
Full Mattermost server source with integrated Community Enterprise features.
Includes vendor directory for offline/air-gapped builds.

Structure:
- enterprise-impl/: Enterprise feature implementations
- enterprise-community/: Init files that register implementations
- enterprise/: Bridge imports (community_imports.go)
- vendor/: All dependencies for offline builds

Build (online):
  go build ./cmd/mattermost

Build (offline/air-gapped):
  go build -mod=vendor ./cmd/mattermost

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-17 23:59:07 +09:00

177 lines
3.4 KiB
Go

//go:build ocr
package docconv
import (
"fmt"
"io"
"os"
"os/exec"
"path/filepath"
"regexp"
"strings"
"sync"
)
// exts lists the file extensions (lower-case, including the leading dot) that
// ConvertPDFImages accepts when collecting extracted images for conversion.
var exts = []string{".jpg", ".tif", ".tiff", ".png", ".pbm"}
// compareExt reports whether ext is exactly equal (case-sensitive) to one of
// the entries in exts. It returns false for an empty or nil exts.
func compareExt(ext string, exts []string) bool {
	found := false
	for i := 0; i < len(exts) && !found; i++ {
		found = exts[i] == ext
	}
	return found
}
// ConvertPDFImages extracts the images of the PDF at path into a temporary
// directory using the external `pdfimages` command, runs ConvertImage on each
// extracted file whose extension is listed in exts, and returns the resulting
// texts concatenated, each followed by a single space.
//
// The texts are emitted in the order in which filepath.Walk reports the files
// (lexical order), so the result is deterministic for a given set of images.
// Images that cannot be opened or converted are skipped (best effort).
//
// An error is returned when the temporary directory cannot be created, when
// `pdfimages` fails, or when the temporary directory cannot be walked; in
// those cases the same error is also stored in the returned BodyResult.
func ConvertPDFImages(path string) (BodyResult, error) {
	bodyResult := BodyResult{}

	tmp, err := os.MkdirTemp(os.TempDir(), "tmp-imgs-")
	if err != nil {
		bodyResult.err = err
		return bodyResult, err
	}
	// The trailing slash is passed on to pdfimages as its output argument.
	// NOTE(review): pdfimages presumably treats this as a file-name prefix,
	// so the slash makes it write inside the directory — confirm against the
	// pdfimages manual.
	tmpDir := tmp + "/"
	defer func() {
		_ = os.RemoveAll(tmpDir) // best-effort cleanup; nothing useful to do on failure
	}()

	if _, err = exec.Command("pdfimages", "-j", path, tmpDir).Output(); err != nil {
		bodyResult.err = err
		return bodyResult, err
	}

	// Collect the absolute paths of all extracted image files.
	var filePaths []string
	walkErr := filepath.Walk(tmpDir, func(p string, info os.FileInfo, err error) error {
		if err != nil {
			// info may be nil here; propagate instead of silently continuing.
			return err
		}
		if info.IsDir() {
			return nil
		}
		abs, err := filepath.Abs(p)
		if err != nil {
			return err
		}
		if compareExt(filepath.Ext(abs), exts) {
			filePaths = append(filePaths, abs)
		}
		return nil
	})
	if walkErr != nil {
		bodyResult.err = walkErr
		return bodyResult, walkErr
	}
	if len(filePaths) == 0 {
		return bodyResult, nil
	}

	// Each goroutine writes only to its own index, so no locking is needed;
	// wg.Wait() makes the writes visible before they are read below.
	texts := make([]string, len(filePaths))
	converted := make([]bool, len(filePaths))

	var wg sync.WaitGroup
	wg.Add(len(filePaths))
	for i, p := range filePaths {
		go func(idx int, pathFile string) {
			defer wg.Done()

			f, err := os.Open(pathFile)
			if err != nil {
				return // best effort: skip images that cannot be opened
			}
			defer f.Close()

			out, _, err := ConvertImage(f)
			if err != nil {
				return // best effort: skip images that cannot be converted
			}
			texts[idx] = out
			converted[idx] = true
		}(i, p)
	}
	wg.Wait()

	// Join in walk order rather than in goroutine completion order.
	var body strings.Builder
	for i, ok := range converted {
		if !ok {
			continue
		}
		body.WriteString(texts[i])
		body.WriteByte(' ')
	}
	bodyResult.body = body.String()
	return bodyResult, nil
}
// PDFHasImage reports whether the PDF at path should be treated as containing
// images that need OCR. It does not inspect images directly: it runs
// `pdffonts -l 5` on the file through a bash pipeline that skips the first two
// lines of the output, keeps the first space-separated field of every
// remaining line and de-duplicates the result. If that pipeline produces no
// output at all, the function returns true.
//
// An error is returned only when the bash command itself reports a failure.
//
// NOTE(review): CombinedOutput also captures stderr, so any diagnostic printed
// by a command in the pipeline makes the output non-empty and yields false.
// NOTE(review): without `pipefail` bash presumably reports only the exit
// status of the last pipeline stage, so a failing pdffonts may go unnoticed —
// confirm before relying on the returned error.
func PDFHasImage(path string) (bool, error) {
	const pipeline = "pdffonts -l 5 %s | tail -n +3 | cut -d' ' -f1 | sort | uniq"

	script := fmt.Sprintf(pipeline, shellEscape(path))
	out, err := exec.Command("bash", "-c", script).CombinedOutput()
	if err != nil {
		return false, err
	}
	return len(out) == 0, nil
}
// ConvertPDF converts the PDF read from r to text. It returns the extracted
// body, the metadata map produced by ConvertPDFText, and an error.
//
// The reader is first materialised via NewLocalFile so that path-based
// helpers can operate on it; f.Done() is deferred to release it.
// Text extraction (ConvertPDFText) is mandatory: any error from it, including
// the errors carried inside its body or meta results, is returned.
// If PDFHasImage reports true, ConvertPDFImages is run as well and its text is
// appended to the body, separated by a single space. Image conversion is best
// effort: when it fails, the plain text result is returned without an error.
func ConvertPDF(r io.Reader) (string, map[string]string, error) {
	f, err := NewLocalFile(r)
	if err != nil {
		// %w keeps the underlying error inspectable, matching the wrapping
		// used for the PDFHasImage failure below.
		return "", nil, fmt.Errorf("error creating local file: %w", err)
	}
	defer f.Done()

	bodyResult, metaResult, textConvertErr := ConvertPDFText(f.Name())
	if textConvertErr != nil {
		return "", nil, textConvertErr
	}
	if bodyResult.err != nil {
		return "", nil, bodyResult.err
	}
	if metaResult.err != nil {
		return "", nil, metaResult.err
	}

	hasImage, err := PDFHasImage(f.Name())
	if err != nil {
		return "", nil, fmt.Errorf("could not check if PDF has image: %w", err)
	}
	if !hasImage {
		return bodyResult.body, metaResult.meta, nil
	}

	imageConvertResult, imageConvertErr := ConvertPDFImages(f.Name())
	if imageConvertErr != nil || imageConvertResult.err != nil {
		// Deliberately ignore OCR failures and return what we have.
		return bodyResult.body, metaResult.meta, nil
	}

	fullBody := bodyResult.body + " " + imageConvertResult.body
	return fullBody, metaResult.meta, nil
}
// shellEscapePattern matches any character outside the set of word
// characters and `@%+=:,./-`. It is compiled once at package initialisation.
var shellEscapePattern = regexp.MustCompile(`[^\w@%+=:,./-]`)
// shellEscape returns s in a form that can be used as a single token in a
// shell command line.
//
//   - The empty string becomes ''.
//   - A string with no character matched by shellEscapePattern is returned
//     unchanged.
//   - Any other string is wrapped in single quotes, with each embedded single
//     quote replaced by '"'"' (close quote, double-quoted quote, reopen).
func shellEscape(s string) string {
	switch {
	case s == "":
		return "''"
	case !shellEscapePattern.MatchString(s):
		return s
	default:
		quoted := strings.ReplaceAll(s, "'", "'\"'\"'")
		return "'" + quoted + "'"
	}
}