added tesseract package, trying to recognize text from base64 image

This commit is contained in:
mrfry 2022-11-23 21:47:07 +01:00
parent 70e7af0ba0
commit 00ec614f1d
9 changed files with 484 additions and 284 deletions

1
.gitignore vendored
View file

@ -9,3 +9,4 @@ duplicateRemovingLog/
src/extraModules
/*.sh
extraSubmodules
*.traineddata

617
package-lock.json generated

File diff suppressed because it is too large Load diff

View file

@ -10,6 +10,7 @@
"express": "^4.17.3",
"express-fileupload": "^1.3.1",
"socket.io": "^4.4.1",
"tesseract.js": "^3.0.3",
"ts-node": "^10.7.0",
"typescript": "^4.6.2",
"uuid": "^8.3.2",

View file

@ -621,6 +621,7 @@ function setup(data: SubmoduleData): Submodule {
if (totalNewQuestions > 0) {
resultArray.forEach((result) => {
msgAllWorker({
// TODO: recognize base64 image
type: 'newQuestions',
data: result,
})

View file

@ -0,0 +1,32 @@
// import {
// recognizeTextFromBase64,
// terminateWorker,
// tesseractLoaded,
// } from '../utils/tesseract'
//
// const imgs = [
// 'data:image/png;base64, iVBORw0KGgoAAAANSUhEUgAAARAAAAATCAYAAABY6VIPAAAAAXNSR0IArs4c6QAAAARzQklUCAgICHwIZIgAAAUzSURBVHic7Zs/axtJFMB/hHyCxW5SXBNYctdclUZwAYMbF2EhVRpjOFRe5cJwjYprBCn8CURA3CcQKdIYDAmIg1RXJQjcXHGNgj7DFTMv+/R2dnZWUmSdmR8IW7Ozb96+PzPzZm3IZDKZTCaTyWQymUwmk8lkMplMJyPgLnJ94K+XifImCX0LL3Ogxhglyh8CVU/5D4E+NrpvJjg/7ZNN7ZMSr5uy69zaFzZ/Sv/9Trc/Vjfc4pJyFhD2q7++S1bA0x3L1LwBnvtxMt2UwBW7S/p9Tx4hKuAYN0HcJ/vOrV1g8/NH9b0E3gNPH6kOfwEvA4IObWZMocAlQ548MofAQ8gtPfkt/M9CTyBL/9Nu+V8DbwMCZeslnyLQp6K5pRxQrwixraNs/UJybR9bqrwCflHjjXAroi2Z2vTXcttWr4m5X8suWtqhLr/adLeyYyu5tpEdU9/XZgOhwq0oJ+Zea6OYntJX7GjLzCHt9taEtvO6rerQSRgB18DvHf1C2O269UHMvyH65FYVGG9Cs9TXvkzxfRW4vml+DoExaoGWAWwNKYpZYWJgCQI9qO0bSs5BoJ8eexh4MPsAOqgkePX1ylzTzovpb43addYifbTTQpOJTqyY7hIcQih4RI7VU9s5NEYs0Euak+XQXI/pKc8Vm0C0rm3+tbbUz1wZHcW2oRgKyWojFttyXT9LzL+WvrmVMoFYX6b4XvS3NkvNT2Go9XtkLs6BC3XTK+CSJq99+0rdJ8pZxtS7gcLLnwf6aQWfAH9E+ohc2UrFxgdXY+oxY/ovgR9U31Ddqilx21M9AU2VzJXX9VmC7iXOPjpJ3gI/mTFDNpqxXrJNgSP13dogBa3Ht20rtR/topEq6zPrNtZ8xO0ahFPgxv9+jTvbElY4P77oGLsPNjbwY0oJkuLfEKm5lYL1ZYrvJY5X5nqf/CxxcffNl48DncQhC5wjn7cIu/YfzRHNc4cPuPOImVd03CIPnPJExtQsA22h8QH+CbS16T/HHbzJ6tB1EPseODNtF9TPIuhgadNdsKf2UyNb9LJMcGWI8Lf6PWSDLgrgk2kTPfse/A2APxP6SYAPgC+459UT5VfTfwn83FOXLqx/vrJu1y7/tpGaW12EfNnX98f+Z5/8PLINoQnkIy5olpg6x3BO2oomK5es1G8ifae41eYT3/cNDcT1n/lP2aHLCBcUC9M+ZvOT/1vi2+42G93hnknu3fZ1r5y06wk09joydlZV4XxvT/HbuKHeddmAtovEMfBvRFYKJ7gk0jLtmHrC3NS/qbnVl2183yc/55icsSUM1Fuya9zsFOKGtNVEeEe9zbTJZpnjkjIWrNsS01/XurLahZJD6kRb4sgWfJMTdrFN17mLtZHo90V9tytkCnoFk6SRINc1d0jP3yJyj3FvIoSukmOOs+Ep6zE4Zj3RC+JxCm7LHWPA+nPeeJna51e4GIbt/JuSW5+9fH0+d9LSF3bj+9T8tOdgwR0IuId7EhEWSvIp7ecWMvOeR5TTzKjLiNSdTh+69Nft5zRXioJ6AtLOusTpfkZzhU3dUclhpC6vzmj6wtrokrrcuGW97Elh4e+5o15hT6ltYXcCV3480fOM9sCVtzVythHbJgtjmjEoq772T8g2wszrd0EzjnRJpUuJue+rSzfxK7hY2Ma/Xbm1wD27jD8mXi7KOdA2vu+bn3tHnzhnHibWx/o0P3PYHHx+rr36yTxI9J+ud73azBwWB5uf8g76//L/G5l07B/SDVn/g6a8+zh8cn5mMpn74z8Hc7Fd+XQCcQAAAABJRU5ErkJggg==',
// ]
//
// const expectedResults = ['Melyik híres zenekar tagja volt Joe Muranyi?']
// Placeholder test: the real OCR check is disabled (see the TODO inside), so
// the only live assertion is a trivially true one.
test('Img text recognition works', async () => {
// TODO: tesseract keeps its workers alive even after terminate(), and
// `jest --detectOpenHandles` reports them as open handles. Until that is
// sorted out the recognition check below stays commented out.
expect(true).toBeTruthy()
// Disabled flow: wait for the tesseract worker to load, run recognition on
// every entry of `imgs`, and compare the trimmed text with the entry at the
// same index in `expectedResults`.
// await tesseractLoaded
// for (let i = 0; i < imgs.length; i++) {
// const expectedResult = expectedResults[i]
// const img = imgs[i]
//
// const text = await recognizeTextFromBase64(img)
// expect(text.trim() === expectedResult).toBeTruthy()
// }
//
// await terminateWorker()
//
// After terminating, the disabled code waits one more second before the
// test is allowed to finish.
// return new Promise<void>((resolve) => {
// setTimeout(() => {
// resolve()
// }, 1 * 1000)
// })
})

View file

@ -35,6 +35,7 @@ export interface QuestionData {
val: string
selectedByUser?: boolean
}>
base64?: string[]
}
export interface Question {

View file

@ -21,6 +21,7 @@
import { isMainThread, parentPort, workerData } from 'worker_threads'
import { recognizeTextFromBase64, tesseractLoaded } from './tesseract'
import logger from './logger'
import {
Question,
@ -114,7 +115,10 @@ function normalizeSpaces(input: string): string {
}
// Passes `toremove` through normalizeSpaces() and collapses every run of
// whitespace into a single space.
// NOTE(review): this view shows the commit's removed and added lines side by
// side without +/- markers (hunk header: 7 lines before, 10 after). The first
// `return` below appears to be the removed line; the chained `return` that
// follows is its replacement, which additionally trims the result.
function removeUnnecesarySpaces(toremove: string): string {
return normalizeSpaces(toremove).replace(/\s+/g, ' ')
return normalizeSpaces(toremove)
.replace(/\s+/g, ' ')
// NOTE(review): `\s+` above already turns CR/LF characters into spaces, so
// this newline removal looks like a no-op - confirm before relying on it.
.replace(/(\r\n|\n|\r)/gm, '')
.trim()
}
function compareString(
@ -278,6 +282,34 @@ function createQuestion(
}
}
/**
 * Converts an image-based question into a text question by running text
 * recognition on its base64 images.
 *
 * @param question - Question whose `data.base64` may hold base64 image strings.
 * @returns A copy of the question with `Q` set to the recognized texts joined
 *   by a single space and `data.type` set to 'simple'. The original question
 *   is returned untouched when it carries no base64 images or when
 *   recognition throws.
 */
async function recognizeQuestionImage(question: Question): Promise<Question> {
  const images = question.data.base64
  // Nothing to recognize: hand the question back as-is.
  if (!Array.isArray(images) || !images.length) {
    return question
  }

  try {
    const recognizedTexts: string[] = []
    // Images are processed one after another, in their original order.
    for (const image of images) {
      recognizedTexts.push(await recognizeTextFromBase64(image))
    }

    return {
      ...question,
      Q: recognizedTexts.join(' '),
      data: {
        ...question.data,
        type: 'simple',
      },
    }
  } catch (e) {
    // Best effort: log the failure and fall back to the unmodified question.
    console.error('Error happened in recognizeQuestionImage!')
    console.error(e)
    return question
  }
}
function compareImage(data: QuestionData, data2: QuestionData): number {
if (data.hashedImages && data2.hashedImages) {
return compareString(
@ -643,6 +675,10 @@ interface WorkData {
}
// Call handleWorkerData() only when this module is not running on the main
// thread; on the main thread nothing happens here.
if (!isMainThread) {
handleWorkerData()
}
function handleWorkerData() {
const {
workerIndex,
initData,
@ -653,11 +689,12 @@ if (!isMainThread) {
`[THREAD #${workerIndex}]: Worker ${workerIndex} reporting for duty`
)
parentPort.on('message', (msg /*: TaskObject */) => {
parentPort.on('message', async (msg /*: TaskObject */) => {
await tesseractLoaded
if (msg.type === 'work') {
const {
subjName,
question,
question: originalQuestion,
searchTillMatchPercent,
searchInAllIfNoResult,
searchIn,
@ -667,6 +704,8 @@ if (!isMainThread) {
let searchResult: SearchResultQuestion[] = []
let error = false
const question = await recognizeQuestionImage(originalQuestion)
try {
qdbs.forEach((qdb) => {
if (searchIn.includes(qdb.index)) {
@ -857,8 +896,6 @@ if (!isMainThread) {
})
}
})
} else {
// console.log('[THREAD]: Main thread!')
}
export function cleanDb(
@ -917,4 +954,5 @@ export {
dataToString,
doSearch,
setNoPossibleAnswersPenalties,
recognizeQuestionImage,
}

53
src/utils/tesseract.ts Normal file
View file

@ -0,0 +1,53 @@
import {
createWorker,
Worker as TesseractWorker,
ConfigResult,
} from 'tesseract.js'
import logger from './logger'
import { isMainThread, workerData } from 'worker_threads'
// https://github.com/naptha/tesseract.js/blob/master/docs/api.md
// Module-level handle to the tesseract.js worker. Starts as null and is
// assigned once the initTesseractWorker() call further down this file resolves.
let tesseractWorker: TesseractWorker = null
/**
 * Creates a tesseract.js worker and prepares it for Hungarian + English
 * recognition.
 *
 * @returns The fully initialized worker.
 * @throws Re-throws whatever `load`, `loadLanguage` or `initialize` rejected
 *   with; the half-initialized worker is terminated first so it does not leak.
 */
export async function initTesseractWorker(): Promise<TesseractWorker> {
  const languages = 'hun+eng'
  const worker = createWorker({
    // NOTE(review): per the tesseract.js API docs 'refresh' skips reading the
    // language-data cache - confirm this is intended for every worker start.
    cacheMethod: 'refresh',
    // logger: (m) => console.log(m),
  })

  try {
    await worker.load()
    await worker.loadLanguage(languages)
    await worker.initialize(languages)
  } catch (e) {
    // Cleanup is best effort: the initialization error is the one callers
    // need to see, so a failing terminate() must not mask it.
    await worker.terminate().catch((): void => undefined)
    throw e
  }

  return worker
}
// Resolver of `tesseractLoaded`; assigned synchronously by the Promise
// executor below.
let resolveLoaded: () => void = null

/**
 * Settles once the module-level worker initialization has finished.
 *
 * It resolves on failure as well, so code awaiting it is never blocked
 * forever; in that case `tesseractWorker` stays null.
 */
export const tesseractLoaded: Promise<void> = new Promise((resolve) => {
  resolveLoaded = resolve
})

// Kick off initialization as soon as the module is loaded.
initTesseractWorker()
  .then((worker) => {
    tesseractWorker = worker
    if (isMainThread) {
      logger.Log('Tesseract loaded on main thread')
    } else {
      const { workerIndex }: { workerIndex: number } = workerData
      logger.Log(`[THREAD #${workerIndex}]: Tesseract loaded`)
    }
  })
  .catch((e) => {
    // Without this handler a failed initialization would surface as an
    // unhandled rejection and `tesseractLoaded` would stay pending forever.
    console.error('Error happened while loading tesseract!')
    console.error(e)
  })
  .finally(() => {
    resolveLoaded()
  })
/**
 * Runs text recognition on one image with the module-level tesseract worker.
 *
 * @param base64 - Image to recognize, passed straight to the worker's
 *   `recognize()` call.
 * @returns The recognized text exactly as reported by the worker (not trimmed).
 * @throws Error when no worker is available, plus anything `recognize()`
 *   rejects with.
 */
export async function recognizeTextFromBase64(base64: string): Promise<string> {
  // A caller may arrive before module-level initialization has finished; wait
  // for it instead of dereferencing a still-null worker.
  await tesseractLoaded
  if (!tesseractWorker) {
    throw new Error('Tesseract worker is not available')
  }

  const result = await tesseractWorker.recognize(base64)
  return result.data.text
}
/**
 * Terminates the module-level tesseract worker, if there is one.
 *
 * The module-level reference is cleared before terminating, so calling this
 * again is a no-op and later code cannot pick up a dead worker.
 *
 * @returns Whatever the worker's `terminate()` resolves with, or nothing when
 *   there was no worker.
 */
export async function terminateWorker(): Promise<void | ConfigResult> {
  if (!tesseractWorker) {
    return
  }

  const worker = tesseractWorker
  tesseractWorker = null
  return worker.terminate()
}

View file

@ -15,16 +15,10 @@
"sourceMap": true,
"outDir": "dist",
"noImplicitAny": true,
"lib": [
"ES2020"
]
"lib": ["dom", "ES2020"]
},
"files": [
"src/server.ts"
],
"include": [
"src/**/*"
],
"files": ["src/server.ts"],
"include": ["src/**/*"],
"exclude": [
"src/tests/",
"node_modules",