Tesseract worker restart to avoid memory leak

This commit is contained in:
mrfry 2022-12-10 16:36:59 +01:00
parent 96b413a365
commit 0259cfe1a7
3 changed files with 85 additions and 25 deletions

View file

@ -5,8 +5,12 @@ import {
} from 'tesseract.js'
import logger from './logger'
import utils from './utils'
import { isMainThread, workerData } from 'worker_threads'
// Number of successful recognitions since the counter was last reset; compared
// against MAX_ALLOWED_RECOGNIZE_COUNT to decide when the worker is restarted.
let recognizeCount = 0
// Recognition count above which the Tesseract worker gets restarted.
const MAX_ALLOWED_RECOGNIZE_COUNT = 100
// https://github.com/naptha/tesseract.js/blob/master/docs/api.md
// The Tesseract worker currently in use; stays null until the first
// initialization has completed.
let tesseractWorker: TesseractWorker = null
export async function initTesseractWorker(): Promise<TesseractWorker> {
@ -17,17 +21,6 @@ export async function initTesseractWorker(): Promise<TesseractWorker> {
await worker.load()
await worker.loadLanguage('hun+eng')
await worker.initialize('hun+eng')
return worker
// await worker.terminate();
}
let resolveLoaded: () => void = null
export const tesseractLoaded: Promise<void> = new Promise((resolve) => {
resolveLoaded = resolve
})
initTesseractWorker().then((worker) => {
tesseractWorker = worker
if (isMainThread) {
logger.Log('Tesseract loaded on main thread')
@ -35,14 +28,70 @@ initTesseractWorker().then((worker) => {
const { workerIndex }: { workerIndex: number } = workerData
logger.Log(`[THREAD #${workerIndex}]: Tesseract loaded`)
}
return worker
}
// Resolver of the most recently created `tesseractLoaded` promise. It is null
// only until the promise executor below has run.
let resolveLoaded: () => void = null
// Resolves once a Tesseract worker is ready to use. restartTesseractWorker()
// replaces this binding with a new pending promise for the duration of each
// restart, so read it at the time of use instead of caching its value.
export let tesseractLoaded: Promise<void> = new Promise((resolve) => {
resolveLoaded = resolve
})
/**
 * Terminates the current Tesseract worker and replaces it with a freshly
 * initialized one.
 *
 * `tesseractLoaded` is swapped for a new pending promise before anything is
 * awaited, so recognitions that start while the restart is running wait for
 * it to finish.
 *
 * @returns A promise that settles once the replacement worker has been stored.
 * @throws Rethrows any error raised while terminating the old worker or while
 *   running `initTesseractWorker`. The promise created by this call is
 *   resolved even then, so that waiters are released instead of hanging.
 */
export async function restartTesseractWorker(): Promise<void> {
  // Keep a private handle on the resolver created by THIS call. Relying only on
  // the shared `resolveLoaded` would leave this call's promise unresolved if
  // another restart replaced the shared resolver in the meantime.
  let resolveThisRestart: () => void = () => undefined
  tesseractLoaded = new Promise((resolve) => {
    resolveThisRestart = resolve
    resolveLoaded = resolve
  })

  try {
    // The worker is still null when a restart is requested before the initial
    // load has finished; there is nothing to terminate in that case.
    if (tesseractWorker) {
      await tesseractWorker.terminate()
    }
    tesseractWorker = await initTesseractWorker()
  } finally {
    // Always release waiters, even on failure; the error itself still
    // propagates to the caller of this function.
    resolveThisRestart()
  }
}
// Start loading the first worker as soon as the module is evaluated; once it is
// stored, release everyone waiting on `tesseractLoaded`.
void (async () => {
  tesseractWorker = await initTesseractWorker()
  resolveLoaded()
})()
export async function recognizeTextFromBase64(base64: string): Promise<string> {
const {
data: { text },
} = await tesseractWorker.recognize(base64)
return text
/**
 * Runs OCR on a base64 encoded image with the shared Tesseract worker.
 *
 * Waits for `tesseractLoaded` first, so calls made during the initial load or
 * during a worker restart are delayed until a worker is available.
 *
 * @param base64 The image to recognize, as a base64 string.
 * @returns The recognized text, or `null` when recognition failed (the worker
 *   is restarted in that case before returning).
 * @throws If restarting the worker after a failed recognition fails as well.
 */
export async function recognizeTextFromBase64(
  base64: string
): Promise<string | null> {
  await tesseractLoaded

  try {
    // TODO: somehow integrate confidence
    const {
      data: { text /*, confidence */ },
    } = await tesseractWorker.recognize(base64)

    recognizeCount += 1
    // The periodic restart deliberately runs in the background so the caller
    // gets its text right away. A rejection handler is attached so a failed
    // restart is logged instead of surfacing as an unhandled promise rejection.
    restartIfNecesarry().catch((restartError: unknown) => {
      logger.Log(
        'Error happened during restarting Tesseract worker!',
        logger.GetColor('redbg')
      )
      console.error(restartError)
    })

    return text
  } catch (e) {
    logger.Log(
      'Error happened during recognizing base64 text!',
      logger.GetColor('redbg')
    )
    console.error(e)

    // The worker may be in a broken state after an error; replace it.
    await restartTesseractWorker()

    return null
  }
}
/**
 * Restarts the Tesseract worker once more than `MAX_ALLOWED_RECOGNIZE_COUNT`
 * recognitions have been counted, then logs the difference in resident set
 * size measured before and after the restart.
 *
 * Does nothing while the counter is at or below the limit.
 *
 * @returns A promise that settles when the check (and restart, if one was
 *   needed) is finished.
 * @throws Rethrows any error from `restartTesseractWorker`.
 */
async function restartIfNecesarry(): Promise<void> {
  if (recognizeCount <= MAX_ALLOWED_RECOGNIZE_COUNT) {
    return
  }

  // Reset BEFORE awaiting the restart. If the reset happened afterwards, a
  // recognition finishing while the restart is still running would see the
  // counter above the limit and start a second, overlapping restart.
  recognizeCount = 0

  logger.Log('Restarting Tesseract worker')
  const memBefore = process.memoryUsage().rss
  await restartTesseractWorker()
  const memAfter = process.memoryUsage().rss
  // NOTE(review): this difference is negative when memory grew during the
  // restart — confirm utils.formatBytes copes with negative input.
  const freed = utils.formatBytes(memBefore - memAfter)
  logger.Log(`Restarted tesseract worker, freed up ${freed} memory`)
}
export async function terminateWorker(): Promise<void | ConfigResult> {