tesseract worker restart to aviod memory leak

This commit is contained in:
mrfry 2022-12-10 16:36:59 +01:00
parent 96b413a365
commit 0259cfe1a7
3 changed files with 85 additions and 25 deletions

View file

@ -287,14 +287,16 @@ function createQuestion(
async function recognizeQuestionImage(question: Question): Promise<Question> {
const base64Data = question.data.base64
if (Array.isArray(base64Data) && base64Data.length) {
try {
const res: string[] = []
for (let i = 0; i < base64Data.length; i++) {
const base64 = base64Data[i]
const text = await recognizeTextFromBase64(base64)
const res: string[] = []
for (let i = 0; i < base64Data.length; i++) {
const base64 = base64Data[i]
const text = await recognizeTextFromBase64(base64)
if (text && text.trim()) {
res.push(text)
}
}
if (res.length) {
return {
...question,
Q: res.join(' '),
@ -303,9 +305,6 @@ async function recognizeQuestionImage(question: Question): Promise<Question> {
type: 'simple',
},
}
} catch (e) {
console.error('Error happened in recognizeQuestionImage!')
console.error(e)
}
}
@ -972,5 +971,4 @@ export {
dataToString,
doSearch,
setNoPossibleAnswersPenalties,
recognizeQuestionImage,
}

View file

@ -5,8 +5,12 @@ import {
} from 'tesseract.js'
import logger from './logger'
import utils from './utils'
import { isMainThread, workerData } from 'worker_threads'
let recognizeCount = 0
const MAX_ALLOWED_RECOGNIZE_COUNT = 100
// https://github.com/naptha/tesseract.js/blob/master/docs/api.md
let tesseractWorker: TesseractWorker = null
export async function initTesseractWorker(): Promise<TesseractWorker> {
@ -17,17 +21,6 @@ export async function initTesseractWorker(): Promise<TesseractWorker> {
await worker.load()
await worker.loadLanguage('hun+eng')
await worker.initialize('hun+eng')
return worker
// await worker.terminate();
}
let resolveLoaded: () => void = null
export const tesseractLoaded: Promise<void> = new Promise((resolve) => {
resolveLoaded = resolve
})
initTesseractWorker().then((worker) => {
tesseractWorker = worker
if (isMainThread) {
logger.Log('Tesseract loaded on main thread')
@ -35,14 +28,70 @@ initTesseractWorker().then((worker) => {
const { workerIndex }: { workerIndex: number } = workerData
logger.Log(`[THREAD #${workerIndex}]: Tesseract loaded`)
}
return worker
}
let resolveLoaded: () => void = null
export let tesseractLoaded: Promise<void> = new Promise((resolve) => {
resolveLoaded = resolve
})
export async function restartTesseractWorker(): Promise<void> {
tesseractLoaded = new Promise((resolve) => {
resolveLoaded = resolve
})
await tesseractWorker.terminate()
tesseractWorker = await initTesseractWorker()
resolveLoaded()
}
initTesseractWorker().then((worker) => {
tesseractWorker = worker
resolveLoaded()
})
export async function recognizeTextFromBase64(base64: string): Promise<string> {
const {
data: { text },
} = await tesseractWorker.recognize(base64)
return text
export async function recognizeTextFromBase64(
base64: string
): Promise<string | null> {
await tesseractLoaded
try {
// TODO: somehow integrate confidence
const {
data: { text /*, confidence */ },
} = await tesseractWorker.recognize(base64)
recognizeCount += 1
restartIfNecesarry()
return text
} catch (e) {
logger.Log(
'Error happened during recognizing base64 text!',
logger.GetColor('redbg')
)
console.error(e)
await restartTesseractWorker()
return null
}
}
async function restartIfNecesarry() {
if (recognizeCount > MAX_ALLOWED_RECOGNIZE_COUNT) {
logger.Log('Restarting Tesseract worker')
const memBefore = process.memoryUsage().rss
await restartTesseractWorker()
const memAfter = process.memoryUsage().rss
const freed = utils.formatBytes(memBefore - memAfter)
logger.Log(`Restarted tesseract worker, freed up ${freed} memory`)
recognizeCount = 0
}
}
export async function terminateWorker(): Promise<void | ConfigResult> {

View file

@ -36,6 +36,7 @@ export default {
statFile: statFile,
renameFile: renameFile,
deleteDir: deleteDir,
formatBytes: formatBytes,
}
import fs from 'fs'
@ -296,3 +297,15 @@ function renameFile(oldPath: string, newPath: string): string {
return null
}
}
function formatBytes(number: number, unit: 'MB' | 'GB' = 'MB'): string {
let res = number / 1024 / 1024 // MB
if (unit === 'MB') {
return `${res} MB`
}
res = res / 1024
if (unit === 'GB') {
return `${res} GB`
}
return `${number} byte`
}