// SiYuan - Refactor your thinking
// Copyright (c) 2020-present, b3log.org
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see .
package model
import (
"bytes"
"io/fs"
"os"
"path/filepath"
"runtime"
"strconv"
"strings"
"sync"
"time"
"unicode/utf8"
"code.sajari.com/docconv"
"github.com/88250/epub"
"github.com/88250/go-humanize"
"github.com/88250/gulu"
"github.com/88250/lute/ast"
"github.com/klippa-app/go-pdfium"
"github.com/klippa-app/go-pdfium/requests"
"github.com/klippa-app/go-pdfium/webassembly"
"github.com/siyuan-note/eventbus"
"github.com/siyuan-note/filelock"
"github.com/siyuan-note/logging"
"github.com/siyuan-note/siyuan/kernel/search"
"github.com/siyuan-note/siyuan/kernel/sql"
"github.com/siyuan-note/siyuan/kernel/task"
"github.com/siyuan-note/siyuan/kernel/util"
"github.com/xuri/excelize/v2"
)
// AssetContent is the JSON-serializable search result for a single asset file,
// carrying its identity, location, size and (possibly highlighted) text content.
type AssetContent struct {
ID string `json:"id"`
Name string `json:"name"`
Ext string `json:"ext"` // file extension, including the leading dot
Path string `json:"path"`
Size int64 `json:"size"` // size in bytes
HSize string `json:"hSize"` // human-readable rendering of Size
Updated int64 `json:"updated"` // last-modified time, Unix seconds
Content string `json:"content"`
}
func GetAssetContent(id, query string, queryMethod int) (ret *AssetContent) {
if "" != query && (0 == queryMethod || 1 == queryMethod) {
if 0 == queryMethod {
query = stringQuery(query)
}
}
if !ast.IsNodeIDPattern(id) {
return
}
table := "asset_contents_fts_case_insensitive"
filter := " id = '" + id + "'"
if "" != query {
filter += " AND `" + table + "` MATCH '" + buildAssetContentColumnFilter() + ":(" + query + ")'"
}
projections := "id, name, ext, path, size, updated, " +
"highlight(" + table + ", 6, '" + search.SearchMarkLeft + "', '" + search.SearchMarkRight + "') AS content"
stmt := "SELECT " + projections + " FROM " + table + " WHERE " + filter
assetContents := sql.SelectAssetContentsRawStmt(stmt, 1, 1)
results := fromSQLAssetContents(&assetContents, 36)
if 1 > len(results) {
return
}
ret = results[0]
ret.Content = strings.ReplaceAll(ret.Content, "\n", "
")
return
}
// FullTextSearchAssetContent searches the content of asset files.
//
// method: 0: keyword, 1: query syntax, 2: SQL, 3: regular expression
// orderBy: 0: relevance desc, 1: relevance asc, 2: updated asc, 3: updated desc
func FullTextSearchAssetContent(query string, types map[string]bool, method, orderBy, page, pageSize int) (ret []*AssetContent, matchedAssetCount, pageCount int) {
    query = strings.TrimSpace(query)
    beforeLen := 36
    orderByClause := buildAssetContentOrderBy(orderBy)
    switch method {
    case 1: // query syntax
        filter := buildAssetContentTypeFilter(types)
        ret, matchedAssetCount = fullTextSearchAssetContentByQuerySyntax(query, filter, orderByClause, beforeLen, page, pageSize)
    case 2: // SQL
        ret, matchedAssetCount = searchAssetContentBySQL(query, beforeLen, page, pageSize)
    case 3: // regular expression
        typeFilter := buildAssetContentTypeFilter(types)
        ret, matchedAssetCount = fullTextSearchAssetContentByRegexp(query, typeFilter, orderByClause, beforeLen, page, pageSize)
    default: // keyword
        filter := buildAssetContentTypeFilter(types)
        ret, matchedAssetCount = fullTextSearchAssetContentByKeyword(query, filter, orderByClause, beforeLen, page, pageSize)
    }

    if 0 < pageSize {
        // Guard against a non-positive page size, which would otherwise panic
        // with a division by zero.
        pageCount = (matchedAssetCount + pageSize - 1) / pageSize
    }
    if 1 > len(ret) {
        ret = []*AssetContent{}
    }
    return
}
// fullTextSearchAssetContentByQuerySyntax runs an FTS search using the
// user-supplied query syntax as-is (after stripping invisible characters).
func fullTextSearchAssetContentByQuerySyntax(query, typeFilter, orderBy string, beforeLen, page, pageSize int) (ret []*AssetContent, matchedAssetCount int) {
    cleaned := filterQueryInvisibleChars(query)
    return fullTextSearchAssetContentByFTS(cleaned, typeFilter, orderBy, beforeLen, page, pageSize)
}
// fullTextSearchAssetContentByKeyword escapes a plain keyword into FTS query
// syntax and delegates to the shared FTS search.
func fullTextSearchAssetContentByKeyword(query, typeFilter string, orderBy string, beforeLen, page, pageSize int) (ret []*AssetContent, matchedAssetCount int) {
    escaped := stringQuery(filterQueryInvisibleChars(query))
    return fullTextSearchAssetContentByFTS(escaped, typeFilter, orderBy, beforeLen, page, pageSize)
}
// fullTextSearchAssetContentByRegexp searches asset contents with SQLite's
// REGEXP operator over the name and content columns, paginated via LIMIT/OFFSET.
func fullTextSearchAssetContentByRegexp(exp, typeFilter, orderBy string, beforeLen, page, pageSize int) (ret []*AssetContent, matchedAssetCount int) {
    exp = filterQueryInvisibleChars(exp)

    var stmtBuilder strings.Builder
    stmtBuilder.WriteString("SELECT * FROM `asset_contents_fts_case_insensitive` WHERE ")
    stmtBuilder.WriteString(assetContentFieldRegexp(exp))
    stmtBuilder.WriteString(" AND ext IN ")
    stmtBuilder.WriteString(typeFilter)
    stmtBuilder.WriteString(" ")
    stmtBuilder.WriteString(orderBy)
    stmtBuilder.WriteString(" LIMIT ")
    stmtBuilder.WriteString(strconv.Itoa(pageSize))
    stmtBuilder.WriteString(" OFFSET ")
    stmtBuilder.WriteString(strconv.Itoa((page - 1) * pageSize))

    rows := sql.SelectAssetContentsRawStmtNoParse(stmtBuilder.String(), Conf.Search.Limit)
    ret = fromSQLAssetContents(&rows, beforeLen)
    if len(ret) == 0 {
        ret = []*AssetContent{}
    }
    matchedAssetCount = fullTextSearchAssetContentCountByRegexp(exp, typeFilter)
    return
}
// assetContentFieldRegexp builds the SQL predicate matching exp against either
// the name or the content column.
func assetContentFieldRegexp(exp string) string {
    return "(name REGEXP '" + exp + "' OR content REGEXP '" + exp + "')"
}
// fullTextSearchAssetContentCountByRegexp returns the number of assets whose
// name or content matches the regular expression, restricted by typeFilter.
func fullTextSearchAssetContentCountByRegexp(exp, typeFilter string) (matchedAssetCount int) {
    stmt := "SELECT COUNT(path) AS `assets` FROM `asset_contents_fts_case_insensitive` WHERE " +
        assetContentFieldRegexp(exp) + " AND ext IN " + typeFilter
    rows, _ := sql.QueryAssetContentNoLimit(stmt)
    if len(rows) == 0 {
        return
    }
    return int(rows[0]["assets"].(int64))
}
// fullTextSearchAssetContentByFTS is the shared FTS MATCH search: it selects a
// page of results with snippet() highlighting and computes the total match count.
func fullTextSearchAssetContentByFTS(query, typeFilter, orderBy string, beforeLen, page, pageSize int) (ret []*AssetContent, matchedAssetCount int) {
    const table = "asset_contents_fts_case_insensitive"
    cols := "id, name, ext, path, size, updated, " +
        "snippet(" + table + ", 6, '" + search.SearchMarkLeft + "', '" + search.SearchMarkRight + "', '...', 64) AS content"
    stmt := "SELECT " + cols + " FROM " + table +
        " WHERE (`" + table + "` MATCH '" + buildAssetContentColumnFilter() + ":(" + query + ")')" +
        " AND ext IN " + typeFilter +
        " " + orderBy +
        " LIMIT " + strconv.Itoa(pageSize) + " OFFSET " + strconv.Itoa((page-1)*pageSize)

    rows := sql.SelectAssetContentsRawStmt(stmt, page, pageSize)
    ret = fromSQLAssetContents(&rows, beforeLen)
    if len(ret) == 0 {
        ret = []*AssetContent{}
    }
    matchedAssetCount = fullTextSearchAssetContentCount(query, typeFilter)
    return
}
// searchAssetContentBySQL runs a caller-supplied SELECT statement against the
// asset content table, then derives the total match count by rewriting the
// statement into a COUNT query with any LIMIT clause removed.
func searchAssetContentBySQL(stmt string, beforeLen, page, pageSize int) (ret []*AssetContent, matchedAssetCount int) {
    stmt = filterQueryInvisibleChars(stmt)
    stmt = strings.TrimSpace(stmt)
    assetContents := sql.SelectAssetContentsRawStmt(stmt, page, pageSize)
    ret = fromSQLAssetContents(&assetContents, beforeLen)
    if 1 > len(ret) {
        ret = []*AssetContent{}
        return
    }

    stmt = strings.ToLower(stmt)
    stmt = strings.ReplaceAll(stmt, "select * ", "select COUNT(path) AS `assets` ")
    stmt = removeLimitClause(stmt)
    result, _ := sql.QueryAssetContentNoLimit(stmt)
    if 1 > len(result) {
        // Fixed: this previously checked len(ret), which is always >= 1 here
        // (empty ret returns above), so an empty COUNT result would panic on
        // result[0] below.
        return
    }
    matchedAssetCount = int(result[0]["assets"].(int64))
    return
}
// fullTextSearchAssetContentCount returns the total number of assets matching
// the FTS query, restricted by typeFilter.
func fullTextSearchAssetContentCount(query, typeFilter string) (matchedAssetCount int) {
    query = filterQueryInvisibleChars(query)

    const table = "asset_contents_fts_case_insensitive"
    stmt := "SELECT COUNT(path) AS `assets` FROM `" + table + "` WHERE (`" + table +
        "` MATCH '" + buildAssetContentColumnFilter() + ":(" + query + ")') AND ext IN " + typeFilter
    rows, _ := sql.QueryAssetContentNoLimit(stmt)
    if len(rows) == 0 {
        return
    }
    return int(rows[0]["assets"].(int64))
}
// fromSQLAssetContents converts a slice of database rows into API-level
// results; always returns a non-nil slice.
func fromSQLAssetContents(assetContents *[]*sql.AssetContent, beforeLen int) (ret []*AssetContent) {
    ret = make([]*AssetContent, 0, len(*assetContents))
    for _, row := range *assetContents {
        ret = append(ret, fromSQLAssetContent(row, beforeLen))
    }
    return
}
// fromSQLAssetContent converts one database row into an API-level AssetContent.
// beforeLen is accepted for symmetry with other converters but is not used here.
func fromSQLAssetContent(assetContent *sql.AssetContent, beforeLen int) *AssetContent {
// Escape first so raw asset text cannot inject HTML into the UI.
content := util.EscapeHTML(assetContent.Content)
if strings.Contains(content, search.SearchMarkLeft) {
// NOTE(review): the search markers are replaced with empty strings, i.e.
// stripped entirely; if the UI expects highlight tags (e.g. <mark>) here,
// these replacement literals may be corrupted — confirm against upstream.
content = strings.ReplaceAll(content, search.SearchMarkLeft, "")
content = strings.ReplaceAll(content, search.SearchMarkRight, "")
}
return &AssetContent{
ID: assetContent.ID,
Name: assetContent.Name,
Ext: assetContent.Ext,
Path: assetContent.Path,
Size: assetContent.Size,
HSize: humanize.BytesCustomCeil(uint64(assetContent.Size), 2),
Updated: assetContent.Updated,
Content: content,
}
}
// buildAssetContentColumnFilter returns the FTS column filter restricting
// MATCH queries to the name and content columns.
func buildAssetContentColumnFilter() string {
    const columns = "{name content}"
    return columns
}
// buildAssetContentTypeFilter renders the enabled extensions as a SQL IN list,
// e.g. ('.pdf','.docx'). It returns "" when types is empty and "()" when no
// extension is enabled (preserving the original behavior in both cases).
func buildAssetContentTypeFilter(types map[string]bool) string {
    if len(types) == 0 {
        return ""
    }

    var quoted []string
    for ext, enabled := range types {
        if enabled {
            quoted = append(quoted, "'"+ext+"'")
        }
    }
    return "(" + strings.Join(quoted, ",") + ")"
}
// buildAssetContentOrderBy maps the orderBy code to its ORDER BY clause.
// 0 (and any unknown value): relevance desc, 1: relevance asc,
// 2: updated asc, 3: updated desc.
func buildAssetContentOrderBy(orderBy int) string {
    clauses := map[int]string{
        1: "ORDER BY rank ASC",
        2: "ORDER BY updated ASC",
        3: "ORDER BY updated DESC",
    }
    if clause, ok := clauses[orderBy]; ok {
        return clause
    }
    return "ORDER BY rank DESC"
}
var assetContentSearcher = NewAssetsSearcher()
// RemoveIndexAssetContent queues deletion of the indexed content rows for the
// asset at absPath.
func RemoveIndexAssetContent(absPath string) {
    defer logging.Recover()

    assetsDir := util.GetDataAssetsAbsPath()
    relPath := "assets" + filepath.ToSlash(strings.TrimPrefix(absPath, assetsDir))
    sql.DeleteAssetContentsByPathQueue(relPath)
}
// IndexAssetContent parses a single asset file and (re)indexes its content,
// replacing any existing rows for the same path.
func IndexAssetContent(absPath string) {
    defer logging.Recover()

    ext := filepath.Ext(absPath)
    parser := assetContentSearcher.GetParser(ext)
    if nil == parser {
        return
    }

    result := parser.Parse(absPath)
    if nil == result {
        return
    }

    info, err := os.Stat(absPath)
    if err != nil {
        logging.LogErrorf("stat [%s] failed: %s", absPath, err)
        return
    }

    assetsDir := util.GetDataAssetsAbsPath()
    p := "assets" + filepath.ToSlash(strings.TrimPrefix(absPath, assetsDir))
    assetContents := []*sql.AssetContent{
        {
            ID:   ast.NewNodeID(),
            Name: util.RemoveID(filepath.Base(p)),
            // Fixed: lowercase the extension for consistency with FullIndex
            // (which stores strings.ToLower(ext)), so extension-based search
            // filters match regardless of the file name's casing.
            Ext:     strings.ToLower(ext),
            Path:    p,
            Size:    info.Size(),
            Updated: info.ModTime().Unix(),
            Content: result.Content,
        },
    }
    sql.DeleteAssetContentsByPathQueue(p)
    sql.IndexAssetContentsQueue(assetContents)
}
// ReindexAssetContent schedules a full rebuild of the asset content index on
// the task queue.
func ReindexAssetContent() {
    task.AppendTask(task.AssetContentDatabaseIndexFull, fullReindexAssetContent)
}
// fullReindexAssetContent notifies the user, resets the asset content
// database, and reindexes every supported asset file.
func fullReindexAssetContent() {
    util.PushMsg(Conf.Language(216), 7*1000)

    sql.InitAssetContentDatabase(true)
    assetContentSearcher.FullIndex()
}
// init wires asset content reindexing to the SQL rebuild event at package load.
func init() {
subscribeSQLAssetContentEvents()
}
// subscribeSQLAssetContentEvents registers a handler that triggers a full
// asset content reindex whenever the rebuild event fires.
func subscribeSQLAssetContentEvents() {
eventbus.Subscribe(util.EvtSQLAssetContentRebuild, func() {
ReindexAssetContent()
})
}
var (
// AssetsSearchEnabled toggles asset file content searching.
// NOTE(review): not referenced within this file — presumably read by the
// search/UI layer; confirm before changing.
AssetsSearchEnabled = true
)
// AssetsSearcher maps file extensions to the parsers that extract indexable
// text from them.
type AssetsSearcher struct {
parsers map[string]AssetParser // keyed by lowercase file extension including the dot (see GetParser/NewAssetsSearcher)
lock *sync.Mutex // guards parsers
}
// GetParser returns the parser registered for the given file extension
// (case-insensitive), or nil when the extension is not indexable.
func (searcher *AssetsSearcher) GetParser(ext string) AssetParser {
    key := strings.ToLower(ext)

    searcher.lock.Lock()
    defer searcher.lock.Unlock()
    return searcher.parsers[key]
}
// FullIndex walks the entire data assets directory, parses every supported
// asset file, and queues all results for indexing in one batch.
func (searcher *AssetsSearcher) FullIndex() {
defer logging.Recover()
assetsDir := util.GetDataAssetsAbsPath()
if !gulu.File.IsDir(assetsDir) {
return
}
var results []*AssetParseResult
// NOTE(review): the error returned by filelock.Walk is ignored, and returning
// err from the callback aborts the remaining walk on the first directory
// error — confirm this all-or-nothing behavior is intended.
filelock.Walk(assetsDir, func(absPath string, d fs.DirEntry, err error) error {
if err != nil {
logging.LogErrorf("walk dir [%s] failed: %s", absPath, err)
return err
}
if d.IsDir() {
return nil
}
// Skip files with no registered parser.
ext := filepath.Ext(absPath)
parser := searcher.GetParser(ext)
if nil == parser {
return nil
}
logging.LogInfof("parsing asset content [%s]", absPath)
result := parser.Parse(absPath)
if nil == result {
return nil
}
info, err := d.Info()
if err != nil {
logging.LogErrorf("stat file [%s] failed: %s", absPath, err)
return nil
}
// Store the workspace-relative path plus file metadata on the result.
result.Path = "assets" + filepath.ToSlash(strings.TrimPrefix(absPath, assetsDir))
result.Size = info.Size()
result.Updated = info.ModTime().Unix()
results = append(results, result)
return nil
})
var assetContents []*sql.AssetContent
for _, result := range results {
assetContents = append(assetContents, &sql.AssetContent{
ID: ast.NewNodeID(),
Name: util.RemoveID(filepath.Base(result.Path)),
Ext: strings.ToLower(filepath.Ext(result.Path)),
Path: result.Path,
Size: result.Size,
Updated: result.Updated,
Content: result.Content,
})
}
sql.IndexAssetContentsQueue(assetContents)
}
// NewAssetsSearcher builds the default searcher with parsers registered for
// every supported asset file extension: one shared plain-text parser for the
// text-like formats, plus dedicated parsers for office, PDF and EPUB files.
func NewAssetsSearcher() *AssetsSearcher {
    parsers := map[string]AssetParser{}

    textParser := &TxtAssetParser{}
    for _, ext := range []string{
        ".txt", ".md", ".markdown", ".json", ".log", ".sql", ".html", ".xml",
        ".java", ".h", ".c", ".cpp", ".go", ".rs", ".swift", ".kt", ".py",
        ".php", ".js", ".css", ".ts", ".sh", ".bat", ".cmd", ".ini", ".yaml",
        ".rst", ".adoc", ".textile", ".opml", ".org", ".wiki",
    } {
        parsers[ext] = textParser
    }

    parsers[".docx"] = &DocxAssetParser{}
    parsers[".pptx"] = &PptxAssetParser{}
    parsers[".xlsx"] = &XlsxAssetParser{}
    parsers[".pdf"] = &PdfAssetParser{}
    parsers[".epub"] = &EpubAssetParser{}

    return &AssetsSearcher{
        parsers: parsers,
        lock:    &sync.Mutex{},
    }
}
const (
// TxtAssetContentMaxSize is the maximum size (4 MiB) of a text asset that will be indexed.
TxtAssetContentMaxSize = 1024 * 1024 * 4
// PDFAssetContentMaxPage is the maximum number of pages a PDF may have to be indexed.
PDFAssetContentMaxPage = 1024
)
var (
// PDFAssetContentMaxSize is the maximum PDF file size (default 128 MiB) that
// will be indexed; PdfAssetParser.Parse may override it from the
// SIYUAN_PDF_ASSET_CONTENT_INDEX_MAX_SIZE environment variable.
PDFAssetContentMaxSize uint64 = 1024 * 1024 * 128
)
// AssetParseResult is the outcome of parsing one asset file for indexing.
// Parsers fill Content; FullIndex/IndexAssetContent fill the file metadata.
type AssetParseResult struct {
Path string // workspace-relative path ("assets/...")
Size int64 // file size in bytes
Updated int64 // last-modified time, Unix seconds
Content string // extracted text
}
// AssetParser extracts indexable text from an asset file on disk.
// Parse returns nil when the file is unsupported, unreadable, or skipped.
type AssetParser interface {
Parse(absPath string) *AssetParseResult
}
// TxtAssetParser indexes plain-text assets (source code, markup, logs, ...).
type TxtAssetParser struct {
}

// Parse reads a UTF-8 encoded text asset and returns its content verbatim.
// Oversized or non-UTF-8 files are skipped with a warning; nil is returned on
// any failure.
func (parser *TxtAssetParser) Parse(absPath string) (ret *AssetParseResult) {
    info, statErr := os.Stat(absPath)
    if statErr != nil {
        logging.LogErrorf("stat file [%s] failed: %s", absPath, statErr)
        return
    }
    if info.Size() > TxtAssetContentMaxSize {
        logging.LogWarnf("text asset [%s] is too large [%s]", absPath, humanize.BytesCustomCeil(uint64(info.Size()), 2))
        return
    }

    // Work on a temp copy so the original file stays unlocked while reading.
    tmp := copyTempAsset(absPath)
    if "" == tmp {
        return
    }
    defer os.RemoveAll(tmp)

    data, readErr := os.ReadFile(tmp)
    if readErr != nil {
        logging.LogErrorf("read file [%s] failed: %s", absPath, readErr)
        return
    }
    if !utf8.Valid(data) {
        // Non-UTF-8 encoded text files are not included in asset file content searching https://github.com/siyuan-note/siyuan/issues/9052
        logging.LogWarnf("text asset [%s] is not UTF-8 encoded", absPath)
        return
    }

    ret = &AssetParseResult{
        Content: string(data),
    }
    return
}
// normalizeNonTxtAssetContent collapses every run of whitespace to a single
// space and trims both ends, producing one-line content for indexing.
func normalizeNonTxtAssetContent(content string) (ret string) {
    words := strings.Fields(content)
    return strings.Join(words, " ")
}
// copyTempAsset copies the asset at absPath into the temp conversion directory
// under a random name and returns the copy's path. It returns "" when the copy
// is skipped (editor temp files) or fails, so callers can bail out.
func copyTempAsset(absPath string) (ret string) {
    dir := filepath.Join(util.TempDir, "convert", "asset_content")
    if err := os.MkdirAll(dir, 0755); err != nil {
        logging.LogErrorf("mkdir [%s] failed: [%s]", dir, err)
        return
    }

    baseName := filepath.Base(absPath)
    if strings.HasPrefix(baseName, "~") {
        // Skip office/editor temporary files.
        return
    }

    // Hold the file lock while copying so a concurrent writer cannot corrupt the copy.
    filelock.Lock(absPath)
    defer filelock.Unlock(absPath)
    ext := filepath.Ext(absPath)
    ret = filepath.Join(dir, gulu.Rand.String(7)+ext)
    if err := gulu.File.Copy(absPath, ret); err != nil {
        logging.LogErrorf("copy [src=%s, dest=%s] failed: %s", absPath, ret, err)
        // Fixed: reset the named return on failure — previously the dest path
        // was still returned, handing callers a path to a missing or partially
        // copied file.
        ret = ""
        return
    }
    return
}
// DocxAssetParser extracts text from .docx documents via docconv.
type DocxAssetParser struct {
}

// Parse converts a .docx asset to normalized plain text for indexing; returns
// nil when the file is not a .docx, is missing, or conversion fails.
func (parser *DocxAssetParser) Parse(absPath string) (ret *AssetParseResult) {
    if !strings.HasSuffix(strings.ToLower(absPath), ".docx") {
        return
    }
    if !gulu.File.IsExist(absPath) {
        return
    }

    // Convert a temp copy so the original stays untouched during conversion.
    tmp := copyTempAsset(absPath)
    if "" == tmp {
        return
    }
    defer os.RemoveAll(tmp)

    file, openErr := os.Open(tmp)
    if openErr != nil {
        logging.LogErrorf("open [%s] failed: [%s]", tmp, openErr)
        return
    }
    defer file.Close()

    text, _, convErr := docconv.ConvertDocx(file)
    if convErr != nil {
        logging.LogErrorf("convert [%s] failed: [%s]", tmp, convErr)
        return
    }

    ret = &AssetParseResult{
        Content: normalizeNonTxtAssetContent(text),
    }
    return
}
// PptxAssetParser extracts text from .pptx presentations via docconv.
type PptxAssetParser struct {
}

// Parse converts a .pptx asset to normalized plain text for indexing; returns
// nil when the file is not a .pptx, is missing, or conversion fails.
func (parser *PptxAssetParser) Parse(absPath string) (ret *AssetParseResult) {
    if !strings.HasSuffix(strings.ToLower(absPath), ".pptx") {
        return
    }
    if !gulu.File.IsExist(absPath) {
        return
    }

    // Convert a temp copy so the original stays untouched during conversion.
    tmp := copyTempAsset(absPath)
    if "" == tmp {
        return
    }
    defer os.RemoveAll(tmp)

    file, openErr := os.Open(tmp)
    if openErr != nil {
        logging.LogErrorf("open [%s] failed: [%s]", tmp, openErr)
        return
    }
    defer file.Close()

    text, _, convErr := docconv.ConvertPptx(file)
    if convErr != nil {
        logging.LogErrorf("convert [%s] failed: [%s]", tmp, convErr)
        return
    }

    ret = &AssetParseResult{
        Content: normalizeNonTxtAssetContent(text),
    }
    return
}
// XlsxAssetParser extracts cell text from .xlsx workbooks via excelize.
type XlsxAssetParser struct {
}

// Parse concatenates all cell values from every sheet of a .xlsx asset into
// normalized plain text; returns nil when the file is not a .xlsx, is missing,
// or any sheet cannot be read.
func (parser *XlsxAssetParser) Parse(absPath string) (ret *AssetParseResult) {
    if !strings.HasSuffix(strings.ToLower(absPath), ".xlsx") {
        return
    }
    if !gulu.File.IsExist(absPath) {
        return
    }

    // Convert a temp copy so the original stays untouched during reading.
    tmp := copyTempAsset(absPath)
    if "" == tmp {
        return
    }
    defer os.RemoveAll(tmp)

    workbook, openErr := excelize.OpenFile(tmp)
    if openErr != nil {
        logging.LogErrorf("open [%s] failed: [%s]", tmp, openErr)
        return
    }
    defer workbook.Close()

    var cells bytes.Buffer
    for _, sheetName := range workbook.GetSheetMap() {
        rows, rowsErr := workbook.GetRows(sheetName)
        if nil != rowsErr {
            logging.LogErrorf("get rows from sheet [%s] failed: [%s]", sheetName, rowsErr)
            return
        }
        for _, row := range rows {
            for _, cell := range row {
                cells.WriteString(cell + " ")
            }
        }
    }

    ret = &AssetParseResult{
        Content: normalizeNonTxtAssetContent(cells.String()),
    }
    return
}
// PdfAssetParser extracts text from PDF assets using the go-pdfium
// webassembly module with a worker pool (see Parse below).
type PdfAssetParser struct {
}

// pdfPage struct defines a worker job for text extraction
type pdfPage struct {
pageNo int // page number for text extraction
data *[]byte // pointer to the PDF document bytes, shared read-only by all workers
}

// pdfTextResult struct defines the extracted PDF text result
type pdfTextResult struct {
pageNo int // page number of PDF document
text string // text of converted page
err error // processing error
}
// getTextPageWorker extracts text from PDF pages received on the page channel
// and sends one pdfTextResult per job to the result channel. The pdfium
// instance is closed when the page channel is closed and the loop exits.
func (parser *PdfAssetParser) getTextPageWorker(id int, instance pdfium.Pdfium, page <-chan *pdfPage, result chan<- *pdfTextResult) {
    defer instance.Close()
    for pd := range page {
        doc, err := instance.OpenDocument(&requests.OpenDocument{
            File: pd.data,
        })
        if err != nil {
            // Fixed: do not call FPDF_CloseDocument here — when OpenDocument
            // fails, doc may be nil and dereferencing doc.Document panics.
            result <- &pdfTextResult{
                pageNo: pd.pageNo,
                err:    err,
            }
            continue
        }

        req := &requests.GetPageText{
            Page: requests.Page{
                ByIndex: &requests.PageByIndex{
                    Document: doc.Document,
                    Index:    pd.pageNo,
                },
            },
        }
        res, err := instance.GetPageText(req)
        // The document is closed on both the success and failure paths.
        instance.FPDF_CloseDocument(&requests.FPDF_CloseDocument{
            Document: doc.Document,
        })
        if err != nil {
            result <- &pdfTextResult{
                pageNo: pd.pageNo,
                err:    err,
            }
            continue
        }

        result <- &pdfTextResult{
            pageNo: pd.pageNo,
            text:   res.Text,
            err:    nil,
        }
    }
}
// Parse will parse a PDF document using PDFium webassembly module using a worker pool.
// It returns nil on unsupported platforms, for missing/oversized/over-long
// documents, and on any conversion failure.
func (parser *PdfAssetParser) Parse(absPath string) (ret *AssetParseResult) {
if util.ContainerIOS == util.Container || util.ContainerAndroid == util.Container || util.ContainerHarmony == util.Container {
// PDF asset content searching is not supported on mobile platforms
return
}
now := time.Now()
if !strings.HasSuffix(strings.ToLower(absPath), ".pdf") {
return
}
if !gulu.File.IsExist(absPath) {
return
}
tmp := copyTempAsset(absPath)
if "" == tmp {
return
}
defer os.RemoveAll(tmp)
// PDF blob will be processed in-memory making sharing of PDF document data across worker goroutines possible
pdfData, err := os.ReadFile(tmp)
if err != nil {
logging.LogErrorf("open [%s] failed: [%s]", tmp, err)
return
}
// initialize go-pdfium with number of available cores
// we fire up the complete worker pool for maximum performance
cores := runtime.NumCPU()
if 4 < cores {
cores = 4 // Limit memory usage
}
pool, err := webassembly.Init(webassembly.Config{
MinIdle: cores,
MaxIdle: cores,
MaxTotal: cores,
})
if err != nil {
logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
return
}
defer pool.Close()
// first get the number of PDF pages to convert into text
instance, err := pool.GetInstance(time.Second * 30)
if err != nil {
logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
return
}
doc, err := instance.OpenDocument(&requests.OpenDocument{
File: &pdfData,
})
if err != nil {
instance.Close()
logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
return
}
pc, err := instance.FPDF_GetPageCount(&requests.FPDF_GetPageCount{Document: doc.Document})
if err != nil {
instance.FPDF_CloseDocument(&requests.FPDF_CloseDocument{
Document: doc.Document,
})
instance.Close()
logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
return
}
// This instance was only needed for the page count; workers get their own.
instance.Close()
if PDFAssetContentMaxPage < pc.PageCount {
// PDF files longer than 1024 pages are not included in asset file content searching https://github.com/siyuan-note/siyuan/issues/9053
logging.LogWarnf("ignore large PDF asset [%s] with [%d] pages", absPath, pc.PageCount)
return
}
// Allow overriding the size limit via environment variable.
if maxSizeVal := os.Getenv("SIYUAN_PDF_ASSET_CONTENT_INDEX_MAX_SIZE"); "" != maxSizeVal {
if maxSize, parseErr := strconv.ParseUint(maxSizeVal, 10, 64); nil == parseErr {
if maxSize != PDFAssetContentMaxSize {
PDFAssetContentMaxSize = maxSize
logging.LogInfof("set PDF asset content index max size to [%s]", humanize.BytesCustomCeil(maxSize, 2))
}
} else {
logging.LogWarnf("invalid env [SIYUAN_PDF_ASSET_CONTENT_INDEX_MAX_SIZE]: [%s], parsing failed: %s", maxSizeVal, parseErr)
}
}
if PDFAssetContentMaxSize < uint64(len(pdfData)) {
// PDF files larger than 128MB are not included in asset file content searching https://github.com/siyuan-note/siyuan/issues/9500
logging.LogWarnf("ignore large PDF asset [%s] with [%s]", absPath, humanize.BytesCustomCeil(uint64(len(pdfData)), 2))
return
}
// next setup worker pool for processing PDF pages
// Channels are buffered to the full page count so producers never block.
pages := make(chan *pdfPage, pc.PageCount)
results := make(chan *pdfTextResult, pc.PageCount)
for i := 0; i < cores; i++ {
inst, err := pool.GetInstance(time.Second * 30)
if err != nil {
// NOTE(review): closing both channels here while already-started workers
// may still send to results risks a send-on-closed-channel panic when
// GetInstance fails mid-loop — confirm and consider draining instead.
close(pages)
close(results)
logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
return
}
go parser.getTextPageWorker(i, inst, pages, results)
}
// now split pages and let them process by worker pool
for p := 0; p < pc.PageCount; p++ {
pages <- &pdfPage{
pageNo: p,
data: &pdfData,
}
}
close(pages)
// finally fetch the PDF page text results
// Note: some workers will process pages faster than other workers depending on the page contents
// the order of returned PDF text pages is random and must be sorted using the pageNo index
pageText := make([]string, pc.PageCount)
for p := 0; p < pc.PageCount; p++ {
res := <-results
pageText[res.pageNo] = res.text
if nil != res.err {
logging.LogErrorf("convert [%s] of page %d failed: [%s]", tmp, res.pageNo, res.err)
}
}
close(results)
if 128 < pc.PageCount {
logging.LogInfof("convert [%s] PDF with [%d] pages using [%d] workers took [%s]", absPath, pc.PageCount, cores, time.Since(now))
}
// loop through ordered PDF text pages and join content for asset parse DB result
contentBuilder := bytes.Buffer{}
for _, pt := range pageText {
contentBuilder.WriteString(" " + normalizeNonTxtAssetContent(pt))
}
ret = &AssetParseResult{
Content: contentBuilder.String(),
}
return
}
// EpubAssetParser extracts text from .epub books.
type EpubAssetParser struct {
}

// Parse converts an .epub asset to normalized plain text for indexing; returns
// nil when the file is not an .epub, is missing, or conversion fails.
func (parser *EpubAssetParser) Parse(absPath string) (ret *AssetParseResult) {
    if !strings.HasSuffix(strings.ToLower(absPath), ".epub") {
        return
    }
    if !gulu.File.IsExist(absPath) {
        return
    }

    // Convert a temp copy so the original stays untouched during conversion.
    tmp := copyTempAsset(absPath)
    if "" == tmp {
        return
    }
    defer os.RemoveAll(tmp)

    // Fixed: removed a useless os.Open of tmp — epub.ToTxt takes the path
    // itself, so the extra handle was opened only to be closed again.
    buf := bytes.Buffer{}
    if err := epub.ToTxt(tmp, &buf); err != nil {
        logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
        return
    }

    ret = &AssetParseResult{
        Content: normalizeNonTxtAssetContent(buf.String()),
    }
    return
}