mirror of
https://gitlab.com/MrFry/mrfrys-node-server
synced 2025-04-01 20:24:18 +02:00
101 lines
2.6 KiB
TypeScript
101 lines
2.6 KiB
TypeScript
import {
|
|
createWorker,
|
|
Worker as TesseractWorker,
|
|
ConfigResult,
|
|
} from 'tesseract.js'
|
|
|
|
import logger from './logger'
|
|
import utils from './utils'
|
|
import { isMainThread, workerData } from 'worker_threads'
|
|
|
|
/**
 * Recognition count above which the worker gets restarted
 * (noted as roughly 500 MB by the original author).
 */
const MAX_ALLOWED_RECOGNIZE_COUNT = 3000 // ~ 500 MB

/**
 * Counts completed recognitions; compared against
 * MAX_ALLOWED_RECOGNIZE_COUNT to decide when a restart is due.
 */
let recognizeCount = 0

/**
 * The Tesseract worker used by this module; null until one has been created.
 * API reference: https://github.com/naptha/tesseract.js/blob/master/docs/api.md
 */
let tesseractWorker: TesseractWorker = null
|
|
/**
 * Creates a Tesseract worker, loads and initializes the 'hun+eng' language
 * set on it, and logs which thread finished loading.
 *
 * If loading or initializing the languages fails, the half-built worker is
 * terminated before the error is rethrown, so a failed start does not leave
 * a worker behind.
 *
 * @returns the initialized worker
 * @throws whatever `createWorker`, `loadLanguage` or `initialize` rejects with
 */
export async function initTesseractWorker(): Promise<TesseractWorker> {
  const languages = 'hun+eng'

  const worker = await createWorker({
    cacheMethod: 'refresh',
    // logger: (m) => console.log(m),
  })

  try {
    await worker.loadLanguage(languages)
    await worker.initialize(languages)
  } catch (e) {
    // Best-effort cleanup: a failing terminate() must not mask the original
    // error, so its own failure is ignored.
    try {
      await worker.terminate()
    } catch (terminateError) {
      // intentionally ignored
    }
    throw e
  }

  if (isMainThread) {
    logger.Log('Tesseract loaded on main thread')
  } else {
    // Inside a worker thread the index passed via workerData identifies it.
    const { workerIndex }: { workerIndex: number } = workerData
    logger.Log(`[THREAD #${workerIndex}]: Tesseract loaded`)
  }

  return worker
}
|
|
|
|
// Resolver of the current `tesseractLoaded` promise. It is assigned inside
// the promise executor, which runs synchronously during construction.
let resolveLoaded: () => void = null

/**
 * Stays pending until `resolveLoaded()` is called. `recognizeTextFromBase64`
 * awaits it before touching the worker, and `restartTesseractWorker`
 * replaces it with a fresh pending promise.
 */
export let tesseractLoaded: Promise<void> = new Promise<void>((markReady) => {
  resolveLoaded = markReady
})
|
|
|
|
/**
 * Replaces the current Tesseract worker with a freshly initialized one.
 *
 * While the restart is running, `tesseractLoaded` is a new pending promise,
 * so code awaiting it waits for the new worker. That promise is resolved
 * even when the restart fails: otherwise every later awaiter would wait
 * forever. After a failed restart the next recognition attempt fails on the
 * missing worker instead.
 *
 * @throws whatever terminating the old worker or `initTesseractWorker`
 *   rejects with
 */
export async function restartTesseractWorker(): Promise<void> {
  // Keep a local handle on this restart's resolver. The module-level
  // `resolveLoaded` may be overwritten by an overlapping restart, which
  // would leave this promise unresolved.
  let markLoaded: () => void = () => undefined
  tesseractLoaded = new Promise<void>((resolve) => {
    markLoaded = resolve
    resolveLoaded = resolve
  })

  try {
    // The worker is null until the first initialization has finished.
    if (tesseractWorker) {
      const staleWorker = tesseractWorker
      // Drop the reference first so a failed terminate() does not leave a
      // possibly broken worker installed.
      tesseractWorker = null
      await staleWorker.terminate()
    }
    tesseractWorker = await initTesseractWorker()
  } finally {
    markLoaded()
  }
}
|
|
|
|
// Create the initial worker as soon as this module is loaded. Everything
// awaiting `tesseractLoaded` is released once the worker is stored.
initTesseractWorker()
  .then((worker) => {
    tesseractWorker = worker
    resolveLoaded()
  })
  .catch((e) => {
    // Without a handler a failed start surfaces as an unhandled promise
    // rejection. `tesseractLoaded` is left pending here, so no recognition
    // is attempted without a worker.
    console.error('Failed to initialize the Tesseract worker!', e)
  })
|
|
|
|
/**
 * Runs OCR on an image and returns the recognized text.
 *
 * Waits for the worker to be loaded first. When recognition fails, the error
 * is logged, a worker restart is attempted and `null` is returned.
 *
 * @param base64 - the image, passed unchanged to the worker's `recognize()`
 * @returns the recognized text, or `null` when recognition failed
 */
export async function recognizeTextFromBase64(
  base64: string
): Promise<string | null> {
  await tesseractLoaded

  try {
    // TODO: somehow integrate confidence
    const {
      data: { text /*, confidence */ },
    } = await tesseractWorker.recognize(base64)

    recognizeCount += 1
    // Runs in the background: the text is returned without waiting for a
    // possible restart. A failure there is logged here so it cannot become
    // an unhandled promise rejection.
    restartIfNecesarry().catch((restartError) => {
      console.error(restartError)
    })

    return text
  } catch (e) {
    logger.Log(
      'Error happened during recognizing base64 text!',
      logger.GetColor('redbg')
    )
    console.error(e)

    try {
      await restartTesseractWorker()
    } catch (restartError) {
      // A failed restart must not turn the `null` result into a rejection.
      logger.Log(
        'Error happened during restarting the Tesseract worker!',
        logger.GetColor('redbg')
      )
      console.error(restartError)
    }

    return null
  }
}
|
|
|
|
/**
 * Restarts the Tesseract worker once more than MAX_ALLOWED_RECOGNIZE_COUNT
 * recognitions have been counted, and logs how much resident memory the
 * restart released.
 *
 * @throws whatever `restartTesseractWorker` rejects with
 */
async function restartIfNecesarry(): Promise<void> {
  if (recognizeCount <= MAX_ALLOWED_RECOGNIZE_COUNT) {
    return
  }

  // Reset before awaiting: a recognition that completes while the restart is
  // still running would otherwise see the counter above the limit and start
  // another restart.
  recognizeCount = 0

  logger.Log('Restarting Tesseract worker')

  const memBefore = process.memoryUsage().rss
  await restartTesseractWorker()
  const memAfter = process.memoryUsage().rss

  // RSS may also have grown across the restart; clamp so a negative amount
  // is never handed to the formatter.
  const freed = utils.formatBytes(Math.max(0, memBefore - memAfter))
  logger.Log(`Restarted tesseract worker, freed up ${freed} memory`)
}
|
|
|
|
/**
 * Shuts down this module's Tesseract worker, if one has been created.
 *
 * @returns the result of the worker's `terminate()` call, or nothing when
 *   there is no worker
 */
export async function terminateWorker(): Promise<void | ConfigResult> {
  // Nothing to shut down before the first worker has been stored.
  if (!tesseractWorker) {
    return
  }

  return tesseractWorker.terminate()
}
|