diff --git a/src/utils/classes.ts b/src/utils/classes.ts index 3888a57..a9919b7 100755 --- a/src/utils/classes.ts +++ b/src/utils/classes.ts @@ -287,14 +287,16 @@ function createQuestion( async function recognizeQuestionImage(question: Question): Promise { const base64Data = question.data.base64 if (Array.isArray(base64Data) && base64Data.length) { - try { - const res: string[] = [] - for (let i = 0; i < base64Data.length; i++) { - const base64 = base64Data[i] - const text = await recognizeTextFromBase64(base64) + const res: string[] = [] + for (let i = 0; i < base64Data.length; i++) { + const base64 = base64Data[i] + const text = await recognizeTextFromBase64(base64) + if (text && text.trim()) { res.push(text) } + } + if (res.length) { return { ...question, Q: res.join(' '), @@ -303,9 +305,6 @@ async function recognizeQuestionImage(question: Question): Promise { type: 'simple', }, } - } catch (e) { - console.error('Error happened in recognizeQuestionImage!') - console.error(e) } } @@ -972,5 +971,4 @@ export { dataToString, doSearch, setNoPossibleAnswersPenalties, - recognizeQuestionImage, } diff --git a/src/utils/tesseract.ts b/src/utils/tesseract.ts index 25a7290..316f8d0 100644 --- a/src/utils/tesseract.ts +++ b/src/utils/tesseract.ts @@ -5,8 +5,12 @@ import { } from 'tesseract.js' import logger from './logger' +import utils from './utils' import { isMainThread, workerData } from 'worker_threads' +let recognizeCount = 0 +const MAX_ALLOWED_RECOGNIZE_COUNT = 100 + // https://github.com/naptha/tesseract.js/blob/master/docs/api.md let tesseractWorker: TesseractWorker = null export async function initTesseractWorker(): Promise { @@ -17,17 +21,6 @@ export async function initTesseractWorker(): Promise { await worker.load() await worker.loadLanguage('hun+eng') await worker.initialize('hun+eng') - return worker - // await worker.terminate(); -} - -let resolveLoaded: () => void = null -export const tesseractLoaded: Promise = new Promise((resolve) => { - resolveLoaded = resolve -}) - -initTesseractWorker().then((worker) => { - tesseractWorker = worker if (isMainThread) { logger.Log('Tesseract loaded on main thread') @@ -35,14 +28,70 @@ initTesseractWorker().then((worker) => { const { workerIndex }: { workerIndex: number } = workerData logger.Log(`[THREAD #${workerIndex}]: Tesseract loaded`) } + + return worker +} + +let resolveLoaded: () => void = null +export let tesseractLoaded: Promise = new Promise((resolve) => { + resolveLoaded = resolve +}) + +export async function restartTesseractWorker(): Promise { + tesseractLoaded = new Promise((resolve) => { + resolveLoaded = resolve + }) + + await tesseractWorker.terminate() + tesseractWorker = await initTesseractWorker() + resolveLoaded() +} + +initTesseractWorker().then((worker) => { + tesseractWorker = worker resolveLoaded() }) -export async function recognizeTextFromBase64(base64: string): Promise { - const { - data: { text }, - } = await tesseractWorker.recognize(base64) - return text +export async function recognizeTextFromBase64( + base64: string +): Promise { + await tesseractLoaded + try { + // TODO: somehow integrate confidence + const { + data: { text /*, confidence */ }, + } = await tesseractWorker.recognize(base64) + + recognizeCount += 1 + restartIfNecesarry() + return text + } catch (e) { + logger.Log( + 'Error happened during recognizing base64 text!', + logger.GetColor('redbg') + ) + console.error(e) + + await restartTesseractWorker() + + return null + } +} + +async function restartIfNecesarry() { + if (recognizeCount > MAX_ALLOWED_RECOGNIZE_COUNT) { + logger.Log('Restarting Tesseract worker') + + const memBefore = process.memoryUsage().rss + + await restartTesseractWorker() + + const memAfter = process.memoryUsage().rss + const freed = utils.formatBytes(memBefore - memAfter) + logger.Log(`Restarted tesseract worker, freed up ${freed} memory`) + + recognizeCount = 0 + } } export async function terminateWorker(): Promise { diff --git a/src/utils/utils.ts b/src/utils/utils.ts index aa47677..7ff44b5 100755 --- a/src/utils/utils.ts +++ b/src/utils/utils.ts @@ -36,6 +36,7 @@ export default { statFile: statFile, renameFile: renameFile, deleteDir: deleteDir, + formatBytes: formatBytes, } import fs from 'fs' @@ -296,3 +297,15 @@ function renameFile(oldPath: string, newPath: string): string { return null } } + +function formatBytes(number: number, unit: 'MB' | 'GB' = 'MB'): string { + let res = number / 1024 / 1024 // MB + if (unit === 'MB') { + return `${res} MB` + } + res = res / 1024 + if (unit === 'GB') { + return `${res} GB` + } + return `${number} byte` +}