tesseract worker restart to avoid memory leak

This commit is contained in:
mrfry 2022-12-10 16:36:59 +01:00
parent 96b413a365
commit 0259cfe1a7
3 changed files with 85 additions and 25 deletions

View file

@ -287,14 +287,16 @@ function createQuestion(
async function recognizeQuestionImage(question: Question): Promise<Question> { async function recognizeQuestionImage(question: Question): Promise<Question> {
const base64Data = question.data.base64 const base64Data = question.data.base64
if (Array.isArray(base64Data) && base64Data.length) { if (Array.isArray(base64Data) && base64Data.length) {
try { const res: string[] = []
const res: string[] = [] for (let i = 0; i < base64Data.length; i++) {
for (let i = 0; i < base64Data.length; i++) { const base64 = base64Data[i]
const base64 = base64Data[i] const text = await recognizeTextFromBase64(base64)
const text = await recognizeTextFromBase64(base64) if (text && text.trim()) {
res.push(text) res.push(text)
} }
}
if (res.length) {
return { return {
...question, ...question,
Q: res.join(' '), Q: res.join(' '),
@ -303,9 +305,6 @@ async function recognizeQuestionImage(question: Question): Promise<Question> {
type: 'simple', type: 'simple',
}, },
} }
} catch (e) {
console.error('Error happened in recognizeQuestionImage!')
console.error(e)
} }
} }
@ -972,5 +971,4 @@ export {
dataToString, dataToString,
doSearch, doSearch,
setNoPossibleAnswersPenalties, setNoPossibleAnswersPenalties,
recognizeQuestionImage,
} }

View file

@ -5,8 +5,12 @@ import {
} from 'tesseract.js' } from 'tesseract.js'
import logger from './logger' import logger from './logger'
import utils from './utils'
import { isMainThread, workerData } from 'worker_threads' import { isMainThread, workerData } from 'worker_threads'
// Number of successful recognize() calls counted so far; incremented in
// recognizeTextFromBase64 and set back to 0 when the worker is restarted.
let recognizeCount = 0
// Once recognizeCount exceeds this value, restartIfNecesarry restarts the
// Tesseract worker.
const MAX_ALLOWED_RECOGNIZE_COUNT = 100
// https://github.com/naptha/tesseract.js/blob/master/docs/api.md // https://github.com/naptha/tesseract.js/blob/master/docs/api.md
let tesseractWorker: TesseractWorker = null let tesseractWorker: TesseractWorker = null
export async function initTesseractWorker(): Promise<TesseractWorker> { export async function initTesseractWorker(): Promise<TesseractWorker> {
@ -17,17 +21,6 @@ export async function initTesseractWorker(): Promise<TesseractWorker> {
await worker.load() await worker.load()
await worker.loadLanguage('hun+eng') await worker.loadLanguage('hun+eng')
await worker.initialize('hun+eng') await worker.initialize('hun+eng')
return worker
// await worker.terminate();
}
let resolveLoaded: () => void = null
export const tesseractLoaded: Promise<void> = new Promise((resolve) => {
resolveLoaded = resolve
})
initTesseractWorker().then((worker) => {
tesseractWorker = worker
if (isMainThread) { if (isMainThread) {
logger.Log('Tesseract loaded on main thread') logger.Log('Tesseract loaded on main thread')
@ -35,14 +28,70 @@ initTesseractWorker().then((worker) => {
const { workerIndex }: { workerIndex: number } = workerData const { workerIndex }: { workerIndex: number } = workerData
logger.Log(`[THREAD #${workerIndex}]: Tesseract loaded`) logger.Log(`[THREAD #${workerIndex}]: Tesseract loaded`)
} }
return worker
}
// Resolver belonging to the promise most recently stored in `tesseractLoaded`.
let resolveLoaded: () => void = null
// Readiness gate: recognizeTextFromBase64 awaits this before using the worker.
// It is resolved via resolveLoaded() after initTesseractWorker() finishes, and
// it is replaced with a fresh pending promise by restartTesseractWorker().
export let tesseractLoaded: Promise<void> = new Promise((resolve) => {
resolveLoaded = resolve
})
/**
 * Terminates the current Tesseract worker and replaces it with a freshly
 * initialized one.
 *
 * While the restart is running, `tesseractLoaded` is swapped for a new pending
 * promise so that recognitions started in the meantime wait for the new worker.
 *
 * @returns A promise that settles when the new worker is ready.
 * @throws Whatever `terminate()` or `initTesseractWorker()` rejects with. The
 *   readiness promise is still released in that case so waiters do not hang.
 */
export async function restartTesseractWorker(): Promise<void> {
    // Keep a resolver that belongs to THIS restart. The module-level
    // `resolveLoaded` is overwritten by any overlapping restart, so resolving
    // through it alone would leave this restart's promise pending forever.
    let releaseWaiters: () => void = () => undefined
    tesseractLoaded = new Promise((resolve) => {
        releaseWaiters = resolve
        resolveLoaded = resolve
    })

    try {
        // The worker is still null until the initial initialization finished.
        if (tesseractWorker) {
            await tesseractWorker.terminate()
        }
        tesseractWorker = await initTesseractWorker()
    } finally {
        // Always release waiters, even when the restart failed; otherwise every
        // later recognition would block on a promise that never resolves.
        releaseWaiters()
    }
}
initTesseractWorker().then((worker) => {
tesseractWorker = worker
resolveLoaded() resolveLoaded()
}) })
export async function recognizeTextFromBase64(base64: string): Promise<string> { export async function recognizeTextFromBase64(
const { base64: string
data: { text }, ): Promise<string | null> {
} = await tesseractWorker.recognize(base64) await tesseractLoaded
return text try {
// TODO: somehow integrate confidence
const {
data: { text /*, confidence */ },
} = await tesseractWorker.recognize(base64)
recognizeCount += 1
restartIfNecesarry()
return text
} catch (e) {
logger.Log(
'Error happened during recognizing base64 text!',
logger.GetColor('redbg')
)
console.error(e)
await restartTesseractWorker()
return null
}
}
/**
 * Restarts the Tesseract worker once more than MAX_ALLOWED_RECOGNIZE_COUNT
 * recognitions have been counted, then logs the difference in process RSS
 * measured before and after the restart.
 *
 * recognizeTextFromBase64 calls this without awaiting it, so this function
 * handles its own errors instead of rejecting.
 */
async function restartIfNecesarry(): Promise<void> {
    if (recognizeCount <= MAX_ALLOWED_RECOGNIZE_COUNT) {
        return
    }

    // Reset before the first await: recognitions that finish while the restart
    // is still running would otherwise see the stale count and trigger
    // additional, overlapping restarts.
    recognizeCount = 0

    logger.Log('Restarting Tesseract worker')
    const memBefore = process.memoryUsage().rss
    try {
        await restartTesseractWorker()
    } catch (e) {
        // Fire-and-forget caller: log instead of leaving an unhandled rejection.
        logger.Log(
            'Error happened during restarting Tesseract worker!',
            logger.GetColor('redbg')
        )
        console.error(e)
        return
    }
    const memAfter = process.memoryUsage().rss
    // May be negative when RSS grew during the restart.
    const freed = utils.formatBytes(memBefore - memAfter)
    logger.Log(`Restarted tesseract worker, freed up ${freed} memory`)
}
export async function terminateWorker(): Promise<void | ConfigResult> { export async function terminateWorker(): Promise<void | ConfigResult> {

View file

@ -36,6 +36,7 @@ export default {
statFile: statFile, statFile: statFile,
renameFile: renameFile, renameFile: renameFile,
deleteDir: deleteDir, deleteDir: deleteDir,
formatBytes: formatBytes,
} }
import fs from 'fs' import fs from 'fs'
@ -296,3 +297,15 @@ function renameFile(oldPath: string, newPath: string): string {
return null return null
} }
} }
/**
 * Renders a byte count as a human readable string in the requested unit.
 *
 * The value is divided by 1024 per unit step and is not rounded.
 *
 * @param number Amount in bytes.
 * @param unit Target unit, 'MB' (default) or 'GB'.
 * @returns `"<value> MB"` or `"<value> GB"`; for any other unit value passed
 *   at runtime, the untouched input followed by `" byte"`.
 */
function formatBytes(number: number, unit: 'MB' | 'GB' = 'MB'): string {
    const megabytes = number / 1024 / 1024
    switch (unit) {
        case 'MB':
            return `${megabytes} MB`
        case 'GB':
            return `${megabytes / 1024} GB`
        default:
            // Only reachable when the type annotation is bypassed.
            return `${number} byte`
    }
}