// SiYuan - Refactor your thinking // Copyright (c) 2020-present, b3log.org // // This program is free software: you can redistribute it and/or modify // it under the terms of the GNU Affero General Public License as published by // the Free Software Foundation, either version 3 of the License, or // (at your option) any later version. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Affero General Public License for more details. // // You should have received a copy of the GNU Affero General Public License // along with this program. If not, see . package model import ( "bytes" "io/fs" "os" "path/filepath" "runtime" "strconv" "strings" "sync" "time" "unicode/utf8" "code.sajari.com/docconv" "github.com/88250/epub" "github.com/88250/go-humanize" "github.com/88250/gulu" "github.com/88250/lute/ast" "github.com/klippa-app/go-pdfium" "github.com/klippa-app/go-pdfium/requests" "github.com/klippa-app/go-pdfium/webassembly" "github.com/siyuan-note/eventbus" "github.com/siyuan-note/filelock" "github.com/siyuan-note/logging" "github.com/siyuan-note/siyuan/kernel/search" "github.com/siyuan-note/siyuan/kernel/sql" "github.com/siyuan-note/siyuan/kernel/task" "github.com/siyuan-note/siyuan/kernel/util" "github.com/xuri/excelize/v2" ) type AssetContent struct { ID string `json:"id"` Name string `json:"name"` Ext string `json:"ext"` Path string `json:"path"` Size int64 `json:"size"` HSize string `json:"hSize"` Updated int64 `json:"updated"` Content string `json:"content"` } func GetAssetContent(id, query string, queryMethod int) (ret *AssetContent) { if "" != query && (0 == queryMethod || 1 == queryMethod) { if 0 == queryMethod { query = stringQuery(query) } } if !ast.IsNodeIDPattern(id) { return } table := "asset_contents_fts_case_insensitive" filter := " id = '" + id + "'" if "" != query { filter += " AND `" + table + "` MATCH '" + buildAssetContentColumnFilter() + ":(" + query + ")'" } projections := "id, name, ext, path, size, updated, " + "highlight(" + table + ", 6, '" + search.SearchMarkLeft + "', '" + search.SearchMarkRight + "') AS content" stmt := "SELECT " + projections + " FROM " + table + " WHERE " + filter assetContents := sql.SelectAssetContentsRawStmt(stmt, 1, 1) results := fromSQLAssetContents(&assetContents, 36) if 1 > len(results) { return } ret = results[0] ret.Content = strings.ReplaceAll(ret.Content, "\n", "
") return } // FullTextSearchAssetContent 搜索资源文件内容。 // // method:0:关键字,1:查询语法,2:SQL,3:正则表达式 // orderBy: 0:按相关度降序,1:按相关度升序,2:按更新时间升序,3:按更新时间降序 func FullTextSearchAssetContent(query string, types map[string]bool, method, orderBy, page, pageSize int) (ret []*AssetContent, matchedAssetCount, pageCount int) { query = strings.TrimSpace(query) beforeLen := 36 orderByClause := buildAssetContentOrderBy(orderBy) switch method { case 1: // 查询语法 filter := buildAssetContentTypeFilter(types) ret, matchedAssetCount = fullTextSearchAssetContentByQuerySyntax(query, filter, orderByClause, beforeLen, page, pageSize) case 2: // SQL ret, matchedAssetCount = searchAssetContentBySQL(query, beforeLen, page, pageSize) case 3: // 正则表达式 typeFilter := buildAssetContentTypeFilter(types) ret, matchedAssetCount = fullTextSearchAssetContentByRegexp(query, typeFilter, orderByClause, beforeLen, page, pageSize) default: // 关键字 filter := buildAssetContentTypeFilter(types) ret, matchedAssetCount = fullTextSearchAssetContentByKeyword(query, filter, orderByClause, beforeLen, page, pageSize) } pageCount = (matchedAssetCount + pageSize - 1) / pageSize if 1 > len(ret) { ret = []*AssetContent{} } return } func fullTextSearchAssetContentByQuerySyntax(query, typeFilter, orderBy string, beforeLen, page, pageSize int) (ret []*AssetContent, matchedAssetCount int) { query = filterQueryInvisibleChars(query) return fullTextSearchAssetContentByFTS(query, typeFilter, orderBy, beforeLen, page, pageSize) } func fullTextSearchAssetContentByKeyword(query, typeFilter string, orderBy string, beforeLen, page, pageSize int) (ret []*AssetContent, matchedAssetCount int) { query = filterQueryInvisibleChars(query) query = stringQuery(query) return fullTextSearchAssetContentByFTS(query, typeFilter, orderBy, beforeLen, page, pageSize) } func fullTextSearchAssetContentByRegexp(exp, typeFilter, orderBy string, beforeLen, page, pageSize int) (ret []*AssetContent, matchedAssetCount int) { exp = filterQueryInvisibleChars(exp) fieldFilter := assetContentFieldRegexp(exp) stmt := "SELECT * FROM `asset_contents_fts_case_insensitive` WHERE " + fieldFilter + " AND ext IN " + typeFilter stmt += " " + orderBy stmt += " LIMIT " + strconv.Itoa(pageSize) + " OFFSET " + strconv.Itoa((page-1)*pageSize) assetContents := sql.SelectAssetContentsRawStmtNoParse(stmt, Conf.Search.Limit) ret = fromSQLAssetContents(&assetContents, beforeLen) if 1 > len(ret) { ret = []*AssetContent{} } matchedAssetCount = fullTextSearchAssetContentCountByRegexp(exp, typeFilter) return } func assetContentFieldRegexp(exp string) string { buf := bytes.Buffer{} buf.WriteString("(name REGEXP '") buf.WriteString(exp) buf.WriteString("' OR content REGEXP '") buf.WriteString(exp) buf.WriteString("')") return buf.String() } func fullTextSearchAssetContentCountByRegexp(exp, typeFilter string) (matchedAssetCount int) { table := "asset_contents_fts_case_insensitive" fieldFilter := assetContentFieldRegexp(exp) stmt := "SELECT COUNT(path) AS `assets` FROM `" + table + "` WHERE " + fieldFilter + " AND ext IN " + typeFilter result, _ := sql.QueryAssetContentNoLimit(stmt) if 1 > len(result) { return } matchedAssetCount = int(result[0]["assets"].(int64)) return } func fullTextSearchAssetContentByFTS(query, typeFilter, orderBy string, beforeLen, page, pageSize int) (ret []*AssetContent, matchedAssetCount int) { table := "asset_contents_fts_case_insensitive" projections := "id, name, ext, path, size, updated, " + "snippet(" + table + ", 6, '" + search.SearchMarkLeft + "', '" + search.SearchMarkRight + "', '...', 64) AS content" stmt := "SELECT " + projections + " FROM " + table + " WHERE (`" + table + "` MATCH '" + buildAssetContentColumnFilter() + ":(" + query + ")'" stmt += ") AND ext IN " + typeFilter stmt += " " + orderBy stmt += " LIMIT " + strconv.Itoa(pageSize) + " OFFSET " + strconv.Itoa((page-1)*pageSize) assetContents := sql.SelectAssetContentsRawStmt(stmt, page, pageSize) ret = fromSQLAssetContents(&assetContents, beforeLen) if 1 > len(ret) { ret = []*AssetContent{} } matchedAssetCount = fullTextSearchAssetContentCount(query, typeFilter) return } func searchAssetContentBySQL(stmt string, beforeLen, page, pageSize int) (ret []*AssetContent, matchedAssetCount int) { stmt = filterQueryInvisibleChars(stmt) stmt = strings.TrimSpace(stmt) assetContents := sql.SelectAssetContentsRawStmt(stmt, page, pageSize) ret = fromSQLAssetContents(&assetContents, beforeLen) if 1 > len(ret) { ret = []*AssetContent{} return } stmt = strings.ToLower(stmt) stmt = strings.ReplaceAll(stmt, "select * ", "select COUNT(path) AS `assets` ") stmt = removeLimitClause(stmt) result, _ := sql.QueryAssetContentNoLimit(stmt) if 1 > len(ret) { return } matchedAssetCount = int(result[0]["assets"].(int64)) return } func fullTextSearchAssetContentCount(query, typeFilter string) (matchedAssetCount int) { query = filterQueryInvisibleChars(query) table := "asset_contents_fts_case_insensitive" stmt := "SELECT COUNT(path) AS `assets` FROM `" + table + "` WHERE (`" + table + "` MATCH '" + buildAssetContentColumnFilter() + ":(" + query + ")'" stmt += ") AND ext IN " + typeFilter result, _ := sql.QueryAssetContentNoLimit(stmt) if 1 > len(result) { return } matchedAssetCount = int(result[0]["assets"].(int64)) return } func fromSQLAssetContents(assetContents *[]*sql.AssetContent, beforeLen int) (ret []*AssetContent) { ret = []*AssetContent{} for _, assetContent := range *assetContents { ret = append(ret, fromSQLAssetContent(assetContent, beforeLen)) } return } func fromSQLAssetContent(assetContent *sql.AssetContent, beforeLen int) *AssetContent { content := util.EscapeHTML(assetContent.Content) if strings.Contains(content, search.SearchMarkLeft) { content = strings.ReplaceAll(content, search.SearchMarkLeft, "") content = strings.ReplaceAll(content, search.SearchMarkRight, "") } return &AssetContent{ ID: assetContent.ID, Name: assetContent.Name, Ext: assetContent.Ext, Path: assetContent.Path, Size: assetContent.Size, HSize: humanize.BytesCustomCeil(uint64(assetContent.Size), 2), Updated: assetContent.Updated, Content: content, } } func buildAssetContentColumnFilter() string { return "{name content}" } func buildAssetContentTypeFilter(types map[string]bool) string { if 0 == len(types) { return "" } var buf bytes.Buffer buf.WriteString("(") for k, enabled := range types { if !enabled { continue } buf.WriteString("'") buf.WriteString(k) buf.WriteString("',") } if 1 == buf.Len() { buf.WriteString(")") return buf.String() } buf.Truncate(buf.Len() - 1) buf.WriteString(")") return buf.String() } func buildAssetContentOrderBy(orderBy int) string { switch orderBy { case 0: return "ORDER BY rank DESC" case 1: return "ORDER BY rank ASC" case 2: return "ORDER BY updated ASC" case 3: return "ORDER BY updated DESC" default: return "ORDER BY rank DESC" } } var assetContentSearcher = NewAssetsSearcher() func RemoveIndexAssetContent(absPath string) { defer logging.Recover() assetsDir := util.GetDataAssetsAbsPath() p := "assets" + filepath.ToSlash(strings.TrimPrefix(absPath, assetsDir)) sql.DeleteAssetContentsByPathQueue(p) } func IndexAssetContent(absPath string) { defer logging.Recover() ext := filepath.Ext(absPath) parser := assetContentSearcher.GetParser(ext) if nil == parser { return } result := parser.Parse(absPath) if nil == result { return } info, err := os.Stat(absPath) if err != nil { logging.LogErrorf("stat [%s] failed: %s", absPath, err) return } assetsDir := util.GetDataAssetsAbsPath() p := "assets" + filepath.ToSlash(strings.TrimPrefix(absPath, assetsDir)) assetContents := []*sql.AssetContent{ { ID: ast.NewNodeID(), Name: util.RemoveID(filepath.Base(p)), Ext: ext, Path: p, Size: info.Size(), Updated: info.ModTime().Unix(), Content: result.Content, }, } sql.DeleteAssetContentsByPathQueue(p) sql.IndexAssetContentsQueue(assetContents) } func ReindexAssetContent() { task.AppendTask(task.AssetContentDatabaseIndexFull, fullReindexAssetContent) return } func fullReindexAssetContent() { util.PushMsg(Conf.Language(216), 7*1000) sql.InitAssetContentDatabase(true) assetContentSearcher.FullIndex() return } func init() { subscribeSQLAssetContentEvents() } func subscribeSQLAssetContentEvents() { eventbus.Subscribe(util.EvtSQLAssetContentRebuild, func() { ReindexAssetContent() }) } var ( AssetsSearchEnabled = true ) type AssetsSearcher struct { parsers map[string]AssetParser lock *sync.Mutex } func (searcher *AssetsSearcher) GetParser(ext string) AssetParser { searcher.lock.Lock() defer searcher.lock.Unlock() return searcher.parsers[strings.ToLower(ext)] } func (searcher *AssetsSearcher) FullIndex() { defer logging.Recover() assetsDir := util.GetDataAssetsAbsPath() if !gulu.File.IsDir(assetsDir) { return } var results []*AssetParseResult filelock.Walk(assetsDir, func(absPath string, d fs.DirEntry, err error) error { if err != nil { logging.LogErrorf("walk dir [%s] failed: %s", absPath, err) return err } if d.IsDir() { return nil } ext := filepath.Ext(absPath) parser := searcher.GetParser(ext) if nil == parser { return nil } logging.LogInfof("parsing asset content [%s]", absPath) result := parser.Parse(absPath) if nil == result { return nil } info, err := d.Info() if err != nil { logging.LogErrorf("stat file [%s] failed: %s", absPath, err) return nil } result.Path = "assets" + filepath.ToSlash(strings.TrimPrefix(absPath, assetsDir)) result.Size = info.Size() result.Updated = info.ModTime().Unix() results = append(results, result) return nil }) var assetContents []*sql.AssetContent for _, result := range results { assetContents = append(assetContents, &sql.AssetContent{ ID: ast.NewNodeID(), Name: util.RemoveID(filepath.Base(result.Path)), Ext: strings.ToLower(filepath.Ext(result.Path)), Path: result.Path, Size: result.Size, Updated: result.Updated, Content: result.Content, }) } sql.IndexAssetContentsQueue(assetContents) } func NewAssetsSearcher() *AssetsSearcher { txtAssetParser := &TxtAssetParser{} return &AssetsSearcher{ parsers: map[string]AssetParser{ ".txt": txtAssetParser, ".md": txtAssetParser, ".markdown": txtAssetParser, ".json": txtAssetParser, ".log": txtAssetParser, ".sql": txtAssetParser, ".html": txtAssetParser, ".xml": txtAssetParser, ".java": txtAssetParser, ".h": txtAssetParser, ".c": txtAssetParser, ".cpp": txtAssetParser, ".go": txtAssetParser, ".rs": txtAssetParser, ".swift": txtAssetParser, ".kt": txtAssetParser, ".py": txtAssetParser, ".php": txtAssetParser, ".js": txtAssetParser, ".css": txtAssetParser, ".ts": txtAssetParser, ".sh": txtAssetParser, ".bat": txtAssetParser, ".cmd": txtAssetParser, ".ini": txtAssetParser, ".yaml": txtAssetParser, ".rst": txtAssetParser, ".adoc": txtAssetParser, ".textile": txtAssetParser, ".opml": txtAssetParser, ".org": txtAssetParser, ".wiki": txtAssetParser, ".docx": &DocxAssetParser{}, ".pptx": &PptxAssetParser{}, ".xlsx": &XlsxAssetParser{}, ".pdf": &PdfAssetParser{}, ".epub": &EpubAssetParser{}, }, lock: &sync.Mutex{}, } } const ( TxtAssetContentMaxSize = 1024 * 1024 * 4 PDFAssetContentMaxPage = 1024 ) var ( PDFAssetContentMaxSize uint64 = 1024 * 1024 * 128 ) type AssetParseResult struct { Path string Size int64 Updated int64 Content string } type AssetParser interface { Parse(absPath string) *AssetParseResult } type TxtAssetParser struct { } func (parser *TxtAssetParser) Parse(absPath string) (ret *AssetParseResult) { info, err := os.Stat(absPath) if err != nil { logging.LogErrorf("stat file [%s] failed: %s", absPath, err) return } if TxtAssetContentMaxSize < info.Size() { logging.LogWarnf("text asset [%s] is too large [%s]", absPath, humanize.BytesCustomCeil(uint64(info.Size()), 2)) return } tmp := copyTempAsset(absPath) if "" == tmp { return } defer os.RemoveAll(tmp) data, err := os.ReadFile(tmp) if err != nil { logging.LogErrorf("read file [%s] failed: %s", absPath, err) return } if !utf8.Valid(data) { // Non-UTF-8 encoded text files are not included in asset file content searching https://github.com/siyuan-note/siyuan/issues/9052 logging.LogWarnf("text asset [%s] is not UTF-8 encoded", absPath) return } content := string(data) ret = &AssetParseResult{ Content: content, } return } func normalizeNonTxtAssetContent(content string) (ret string) { ret = strings.Join(strings.Fields(content), " ") return } func copyTempAsset(absPath string) (ret string) { dir := filepath.Join(util.TempDir, "convert", "asset_content") if err := os.MkdirAll(dir, 0755); err != nil { logging.LogErrorf("mkdir [%s] failed: [%s]", dir, err) return } baseName := filepath.Base(absPath) if strings.HasPrefix(baseName, "~") { return } filelock.Lock(absPath) defer filelock.Unlock(absPath) ext := filepath.Ext(absPath) ret = filepath.Join(dir, gulu.Rand.String(7)+ext) if err := gulu.File.Copy(absPath, ret); err != nil { logging.LogErrorf("copy [src=%s, dest=%s] failed: %s", absPath, ret, err) return } return } type DocxAssetParser struct { } func (parser *DocxAssetParser) Parse(absPath string) (ret *AssetParseResult) { if !strings.HasSuffix(strings.ToLower(absPath), ".docx") { return } if !gulu.File.IsExist(absPath) { return } tmp := copyTempAsset(absPath) if "" == tmp { return } defer os.RemoveAll(tmp) f, err := os.Open(tmp) if err != nil { logging.LogErrorf("open [%s] failed: [%s]", tmp, err) return } defer f.Close() data, _, err := docconv.ConvertDocx(f) if err != nil { logging.LogErrorf("convert [%s] failed: [%s]", tmp, err) return } var content = normalizeNonTxtAssetContent(data) ret = &AssetParseResult{ Content: content, } return } type PptxAssetParser struct { } func (parser *PptxAssetParser) Parse(absPath string) (ret *AssetParseResult) { if !strings.HasSuffix(strings.ToLower(absPath), ".pptx") { return } if !gulu.File.IsExist(absPath) { return } tmp := copyTempAsset(absPath) if "" == tmp { return } defer os.RemoveAll(tmp) f, err := os.Open(tmp) if err != nil { logging.LogErrorf("open [%s] failed: [%s]", tmp, err) return } defer f.Close() data, _, err := docconv.ConvertPptx(f) if err != nil { logging.LogErrorf("convert [%s] failed: [%s]", tmp, err) return } var content = normalizeNonTxtAssetContent(data) ret = &AssetParseResult{ Content: content, } return } type XlsxAssetParser struct { } func (parser *XlsxAssetParser) Parse(absPath string) (ret *AssetParseResult) { if !strings.HasSuffix(strings.ToLower(absPath), ".xlsx") { return } if !gulu.File.IsExist(absPath) { return } tmp := copyTempAsset(absPath) if "" == tmp { return } defer os.RemoveAll(tmp) x, err := excelize.OpenFile(tmp) if err != nil { logging.LogErrorf("open [%s] failed: [%s]", tmp, err) return } defer x.Close() buf := bytes.Buffer{} sheetMap := x.GetSheetMap() for _, sheetName := range sheetMap { rows, getErr := x.GetRows(sheetName) if nil != getErr { logging.LogErrorf("get rows from sheet [%s] failed: [%s]", sheetName, getErr) return } for _, row := range rows { for _, colCell := range row { buf.WriteString(colCell + " ") } } } var content = normalizeNonTxtAssetContent(buf.String()) ret = &AssetParseResult{ Content: content, } return } // PdfAssetParser parser factory product type PdfAssetParser struct { } // pdfPage struct defines a worker job for text extraction type pdfPage struct { pageNo int // page number for text extraction data *[]byte // pointer to PDF document data } // pdfTextResult struct defines the extracted PDF text result type pdfTextResult struct { pageNo int // page number of PDF document text string // text of converted page err error // processing error } // getTextPageWorker will extract the text from a given PDF page and return its result func (parser *PdfAssetParser) getTextPageWorker(id int, instance pdfium.Pdfium, page <-chan *pdfPage, result chan<- *pdfTextResult) { defer instance.Close() for pd := range page { doc, err := instance.OpenDocument(&requests.OpenDocument{ File: pd.data, }) if err != nil { instance.FPDF_CloseDocument(&requests.FPDF_CloseDocument{ Document: doc.Document, }) result <- &pdfTextResult{ pageNo: pd.pageNo, err: err, } continue } req := &requests.GetPageText{ Page: requests.Page{ ByIndex: &requests.PageByIndex{ Document: doc.Document, Index: pd.pageNo, }, }, } res, err := instance.GetPageText(req) if err != nil { instance.FPDF_CloseDocument(&requests.FPDF_CloseDocument{ Document: doc.Document, }) result <- &pdfTextResult{ pageNo: pd.pageNo, err: err, } continue } instance.FPDF_CloseDocument(&requests.FPDF_CloseDocument{ Document: doc.Document, }) result <- &pdfTextResult{ pageNo: pd.pageNo, text: res.Text, err: nil, } } } // Parse will parse a PDF document using PDFium webassembly module using a worker pool func (parser *PdfAssetParser) Parse(absPath string) (ret *AssetParseResult) { if util.ContainerIOS == util.Container || util.ContainerAndroid == util.Container || util.ContainerHarmony == util.Container { // PDF asset content searching is not supported on mobile platforms return } now := time.Now() if !strings.HasSuffix(strings.ToLower(absPath), ".pdf") { return } if !gulu.File.IsExist(absPath) { return } tmp := copyTempAsset(absPath) if "" == tmp { return } defer os.RemoveAll(tmp) // PDF blob will be processed in-memory making sharing of PDF document data across worker goroutines possible pdfData, err := os.ReadFile(tmp) if err != nil { logging.LogErrorf("open [%s] failed: [%s]", tmp, err) return } // initialize go-pdfium with number of available cores // we fire up the complete worker pool for maximum performance cores := runtime.NumCPU() if 4 < cores { cores = 4 // Limit memory usage } pool, err := webassembly.Init(webassembly.Config{ MinIdle: cores, MaxIdle: cores, MaxTotal: cores, }) if err != nil { logging.LogErrorf("convert [%s] failed: [%s]", tmp, err) return } defer pool.Close() // first get the number of PDF pages to convert into text instance, err := pool.GetInstance(time.Second * 30) if err != nil { logging.LogErrorf("convert [%s] failed: [%s]", tmp, err) return } doc, err := instance.OpenDocument(&requests.OpenDocument{ File: &pdfData, }) if err != nil { instance.Close() logging.LogErrorf("convert [%s] failed: [%s]", tmp, err) return } pc, err := instance.FPDF_GetPageCount(&requests.FPDF_GetPageCount{Document: doc.Document}) if err != nil { instance.FPDF_CloseDocument(&requests.FPDF_CloseDocument{ Document: doc.Document, }) instance.Close() logging.LogErrorf("convert [%s] failed: [%s]", tmp, err) return } instance.Close() if PDFAssetContentMaxPage < pc.PageCount { // PDF files longer than 1024 pages are not included in asset file content searching https://github.com/siyuan-note/siyuan/issues/9053 logging.LogWarnf("ignore large PDF asset [%s] with [%d] pages", absPath, pc.PageCount) return } if maxSizeVal := os.Getenv("SIYUAN_PDF_ASSET_CONTENT_INDEX_MAX_SIZE"); "" != maxSizeVal { if maxSize, parseErr := strconv.ParseUint(maxSizeVal, 10, 64); nil == parseErr { if maxSize != PDFAssetContentMaxSize { PDFAssetContentMaxSize = maxSize logging.LogInfof("set PDF asset content index max size to [%s]", humanize.BytesCustomCeil(maxSize, 2)) } } else { logging.LogWarnf("invalid env [SIYUAN_PDF_ASSET_CONTENT_INDEX_MAX_SIZE]: [%s], parsing failed: %s", maxSizeVal, parseErr) } } if PDFAssetContentMaxSize < uint64(len(pdfData)) { // PDF files larger than 128MB are not included in asset file content searching https://github.com/siyuan-note/siyuan/issues/9500 logging.LogWarnf("ignore large PDF asset [%s] with [%s]", absPath, humanize.BytesCustomCeil(uint64(len(pdfData)), 2)) return } // next setup worker pool for processing PDF pages pages := make(chan *pdfPage, pc.PageCount) results := make(chan *pdfTextResult, pc.PageCount) for i := 0; i < cores; i++ { inst, err := pool.GetInstance(time.Second * 30) if err != nil { close(pages) close(results) logging.LogErrorf("convert [%s] failed: [%s]", tmp, err) return } go parser.getTextPageWorker(i, inst, pages, results) } // now split pages and let them process by worker pool for p := 0; p < pc.PageCount; p++ { pages <- &pdfPage{ pageNo: p, data: &pdfData, } } close(pages) // finally fetch the PDF page text results // Note: some workers will process pages faster than other workers depending on the page contents // the order of returned PDF text pages is random and must be sorted using the pageNo index pageText := make([]string, pc.PageCount) for p := 0; p < pc.PageCount; p++ { res := <-results pageText[res.pageNo] = res.text if nil != res.err { logging.LogErrorf("convert [%s] of page %d failed: [%s]", tmp, res.pageNo, res.err) } } close(results) if 128 < pc.PageCount { logging.LogInfof("convert [%s] PDF with [%d] pages using [%d] workers took [%s]", absPath, pc.PageCount, cores, time.Since(now)) } // loop through ordered PDF text pages and join content for asset parse DB result contentBuilder := bytes.Buffer{} for _, pt := range pageText { contentBuilder.WriteString(" " + normalizeNonTxtAssetContent(pt)) } ret = &AssetParseResult{ Content: contentBuilder.String(), } return } type EpubAssetParser struct { } func (parser *EpubAssetParser) Parse(absPath string) (ret *AssetParseResult) { if !strings.HasSuffix(strings.ToLower(absPath), ".epub") { return } if !gulu.File.IsExist(absPath) { return } tmp := copyTempAsset(absPath) if "" == tmp { return } defer os.RemoveAll(tmp) f, err := os.Open(tmp) if err != nil { logging.LogErrorf("open [%s] failed: [%s]", tmp, err) return } defer f.Close() buf := bytes.Buffer{} if err = epub.ToTxt(tmp, &buf); err != nil { logging.LogErrorf("convert [%s] failed: [%s]", tmp, err) return } content := normalizeNonTxtAssetContent(buf.String()) ret = &AssetParseResult{ Content: content, } return }