mrfrys-node-server/src/utils/tesseract.ts
2023-04-02 09:12:25 +02:00

101 lines
2.6 KiB
TypeScript

import {
createWorker,
Worker as TesseractWorker,
ConfigResult,
} from 'tesseract.js'
import logger from './logger'
import utils from './utils'
import { isMainThread, workerData } from 'worker_threads'
// Number of successful recognize() calls since the counter was last reset;
// compared against MAX_ALLOWED_RECOGNIZE_COUNT to decide when to restart.
let recognizeCount = 0
// Restart threshold for recognizeCount. The "~ 500 MB" note is the original
// author's estimate — presumably the memory growth after this many calls
// (NOTE(review): not verifiable from this file).
const MAX_ALLOWED_RECOGNIZE_COUNT = 3000 // ~ 500 MB
// https://github.com/naptha/tesseract.js/blob/master/docs/api.md
// Shared worker used by every recognition call. Stays null until the initial
// initTesseractWorker() call at module load resolves.
let tesseractWorker: TesseractWorker = null
/**
 * Creates a Tesseract worker and prepares it for Hungarian + English
 * recognition.
 *
 * Logs a message once the worker is ready; inside a worker thread the message
 * carries the `workerIndex` read from `workerData`.
 *
 * If loading or initializing the languages fails, the half-built worker is
 * terminated before the error is rethrown, so a failed attempt does not leave
 * an orphaned worker behind.
 *
 * @returns the initialized worker
 * @throws whatever `createWorker`, `loadLanguage` or `initialize` rejects with
 */
export async function initTesseractWorker(): Promise<TesseractWorker> {
  const languages = 'hun+eng'
  const worker = await createWorker({
    cacheMethod: 'refresh',
    // logger: (m) => console.log(m),
  })

  try {
    await worker.loadLanguage(languages)
    await worker.initialize(languages)
  } catch (e) {
    // Best-effort cleanup: the setup error is the one worth reporting, so a
    // failure while terminating is deliberately ignored.
    try {
      await worker.terminate()
    } catch {
      // ignored on purpose, see above
    }
    throw e
  }

  if (isMainThread) {
    logger.Log('Tesseract loaded on main thread')
  } else {
    const { workerIndex }: { workerIndex: number } = workerData
    logger.Log(`[THREAD #${workerIndex}]: Tesseract loaded`)
  }
  return worker
}
// Resolver for the promise currently stored in `tesseractLoaded`. It is
// assigned synchronously by the promise executor below, so it is usable as
// soon as the promise has been constructed.
let resolveLoaded: () => void = null
// Resolves once the shared worker is ready. restartTesseractWorker() replaces
// it with a fresh pending promise for the duration of a restart, which is why
// this is an exported `let` rather than a `const`.
export let tesseractLoaded: Promise<void> = new Promise((resolve) => {
resolveLoaded = resolve
})
/**
 * Replaces the shared Tesseract worker with a freshly initialized one.
 *
 * While the swap is running, `tesseractLoaded` points at a new pending
 * promise. That promise is always resolved when this function finishes, even
 * if terminating the old worker or creating the new one throws; otherwise
 * every later caller awaiting `tesseractLoaded` would wait forever.
 *
 * @throws rethrows any error from terminating the old worker or from
 *   `initTesseractWorker`
 */
export async function restartTesseractWorker(): Promise<void> {
  // Keep a private handle on this restart's resolver: the module-level
  // `resolveLoaded` can be overwritten by an overlapping restart, which would
  // otherwise leave this restart's promise pending forever.
  let markLoaded: () => void = () => undefined
  tesseractLoaded = new Promise((resolve) => {
    markLoaded = resolve
    resolveLoaded = resolve
  })

  try {
    // The worker is still null when the initial load has not finished yet.
    if (tesseractWorker) {
      await tesseractWorker.terminate()
    }
    tesseractWorker = await initTesseractWorker()
  } finally {
    markLoaded()
  }
}
// Kick off the initial worker load at module import time; once the worker is
// stored, everyone awaiting `tesseractLoaded` is released.
// NOTE(review): a rejected initial load is not handled here, so it surfaces as
// an unhandled promise rejection and `tesseractLoaded` stays pending.
void (async () => {
  tesseractWorker = await initTesseractWorker()
  resolveLoaded()
})()
/**
 * Runs OCR on a base64 encoded image using the shared Tesseract worker.
 *
 * Waits for `tesseractLoaded` first. After a successful recognition the
 * call counter is bumped and a worker restart is triggered in the background
 * when the counter passed its limit; the caller does not wait for it.
 *
 * @param base64 the image to recognize, base64 encoded
 * @returns the recognized text, or `null` when recognition failed. Recognition
 *   and restart failures are logged instead of being thrown.
 */
export async function recognizeTextFromBase64(
  base64: string
): Promise<string | null> {
  await tesseractLoaded
  try {
    // TODO: somehow integrate confidence
    const {
      data: { text /*, confidence */ },
    } = await tesseractWorker.recognize(base64)
    recognizeCount += 1
    // Fire-and-forget on purpose, but a failed restart must not become an
    // unhandled promise rejection.
    restartIfNecesarry().catch((restartError) => {
      logger.Log(
        'Error happened during periodic Tesseract worker restart!',
        logger.GetColor('redbg')
      )
      console.error(restartError)
    })
    return text
  } catch (e) {
    logger.Log(
      'Error happened during recognizing base64 text!',
      logger.GetColor('redbg')
    )
    console.error(e)
    try {
      await restartTesseractWorker()
    } catch (restartError) {
      // Keep the documented contract (null on failure) even when the
      // recovery attempt itself fails.
      logger.Log(
        'Error happened during restarting Tesseract worker!',
        logger.GetColor('redbg')
      )
      console.error(restartError)
    }
    return null
  }
}
/**
 * Restarts the shared worker once more than MAX_ALLOWED_RECOGNIZE_COUNT
 * recognitions have completed, and logs the change in resident set size
 * measured around the restart.
 *
 * @throws rethrows any error from `restartTesseractWorker`
 */
async function restartIfNecesarry(): Promise<void> {
  if (recognizeCount <= MAX_ALLOWED_RECOGNIZE_COUNT) {
    return
  }

  // Reset before awaiting: recognitions that finish while the restart is still
  // running would otherwise also see the counter above the limit and each
  // start another restart.
  recognizeCount = 0

  logger.Log('Restarting Tesseract worker')
  const memBefore = process.memoryUsage().rss
  await restartTesseractWorker()
  const memAfter = process.memoryUsage().rss
  // NOTE(review): the difference is negative when RSS grew across the restart;
  // confirm utils.formatBytes copes with that.
  const freed = utils.formatBytes(memBefore - memAfter)
  logger.Log(`Restarted tesseract worker, freed up ${freed} memory`)
}
/**
 * Terminates the shared Tesseract worker if one has been created.
 *
 * @returns the result of the worker's `terminate()` call, or nothing when no
 *   worker exists yet
 */
export async function terminateWorker(): Promise<void | ConfigResult> {
  const activeWorker = tesseractWorker
  if (!activeWorker) {
    return
  }
  return activeWorker.terminate()
}