mirror of
https://gitlab.com/MrFry/mrfrys-node-server
synced 2025-04-01 20:24:18 +02:00
added tesseract package, trying to recognize text from base64 image
This commit is contained in:
parent
70e7af0ba0
commit
00ec614f1d
9 changed files with 484 additions and 284 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -9,3 +9,4 @@ duplicateRemovingLog/
|
|||
src/extraModules
|
||||
/*.sh
|
||||
extraSubmodules
|
||||
*.traineddata
|
||||
|
|
617
package-lock.json
generated
617
package-lock.json
generated
File diff suppressed because it is too large
Load diff
|
@ -10,6 +10,7 @@
|
|||
"express": "^4.17.3",
|
||||
"express-fileupload": "^1.3.1",
|
||||
"socket.io": "^4.4.1",
|
||||
"tesseract.js": "^3.0.3",
|
||||
"ts-node": "^10.7.0",
|
||||
"typescript": "^4.6.2",
|
||||
"uuid": "^8.3.2",
|
||||
|
|
|
@ -621,6 +621,7 @@ function setup(data: SubmoduleData): Submodule {
|
|||
if (totalNewQuestions > 0) {
|
||||
resultArray.forEach((result) => {
|
||||
msgAllWorker({
|
||||
// TODO: recognize base64 image
|
||||
type: 'newQuestions',
|
||||
data: result,
|
||||
})
|
||||
|
|
32
src/tests/base64ToText.test.ts
Normal file
32
src/tests/base64ToText.test.ts
Normal file
|
@ -0,0 +1,32 @@
|
|||
// import {
|
||||
// recognizeTextFromBase64,
|
||||
// terminateWorker,
|
||||
// tesseractLoaded,
|
||||
// } from '../utils/tesseract'
|
||||
//
|
||||
// const imgs = [
|
||||
// 'data:image/png;base64, iVBORw0KGgoAAAANSUhEUgAAARAAAAATCAYAAABY6VIPAAAAAXNSR0IArs4c6QAAAARzQklUCAgICHwIZIgAAAUzSURBVHic7Zs/axtJFMB/hHyCxW5SXBNYctdclUZwAYMbF2EhVRpjOFRe5cJwjYprBCn8CURA3CcQKdIYDAmIg1RXJQjcXHGNgj7DFTMv+/R2dnZWUmSdmR8IW7Ozb96+PzPzZm3IZDKZTCaTyWQymUwmk8lkMplMJyPgLnJ94K+XifImCX0LL3Ogxhglyh8CVU/5D4E+NrpvJjg/7ZNN7ZMSr5uy69zaFzZ/Sv/9Trc/Vjfc4pJyFhD2q7++S1bA0x3L1LwBnvtxMt2UwBW7S/p9Tx4hKuAYN0HcJ/vOrV1g8/NH9b0E3gNPH6kOfwEvA4IObWZMocAlQ548MofAQ8gtPfkt/M9CTyBL/9Nu+V8DbwMCZeslnyLQp6K5pRxQrwixraNs/UJybR9bqrwCflHjjXAroi2Z2vTXcttWr4m5X8suWtqhLr/adLeyYyu5tpEdU9/XZgOhwq0oJ+Zea6OYntJX7GjLzCHt9taEtvO6rerQSRgB18DvHf1C2O269UHMvyH65FYVGG9Cs9TXvkzxfRW4vml+DoExaoGWAWwNKYpZYWJgCQI9qO0bSs5BoJ8eexh4MPsAOqgkePX1ylzTzovpb43addYifbTTQpOJTqyY7hIcQih4RI7VU9s5NEYs0Euak+XQXI/pKc8Vm0C0rm3+tbbUz1wZHcW2oRgKyWojFttyXT9LzL+WvrmVMoFYX6b4XvS3NkvNT2Go9XtkLs6BC3XTK+CSJq99+0rdJ8pZxtS7gcLLnwf6aQWfAH9E+ohc2UrFxgdXY+oxY/ovgR9U31Ddqilx21M9AU2VzJXX9VmC7iXOPjpJ3gI/mTFDNpqxXrJNgSP13dogBa3Ht20rtR/topEq6zPrNtZ8xO0ahFPgxv9+jTvbElY4P77oGLsPNjbwY0oJkuLfEKm5lYL1ZYrvJY5X5nqf/CxxcffNl48DncQhC5wjn7cIu/YfzRHNc4cPuPOImVd03CIPnPJExtQsA22h8QH+CbS16T/HHbzJ6tB1EPseODNtF9TPIuhgadNdsKf2UyNb9LJMcGWI8Lf6PWSDLgrgk2kTPfse/A2APxP6SYAPgC+459UT5VfTfwn83FOXLqx/vrJu1y7/tpGaW12EfNnX98f+Z5/8PLINoQnkIy5olpg6x3BO2oomK5es1G8ifae41eYT3/cNDcT1n/lP2aHLCBcUC9M+ZvOT/1vi2+42G93hnknu3fZ1r5y06wk09joydlZV4XxvT/HbuKHeddmAtovEMfBvRFYKJ7gk0jLtmHrC3NS/qbnVl2183yc/55icsSUM1Fuya9zsFOKGtNVEeEe9zbTJZpnjkjIWrNsS01/XurLahZJD6kRb4sgWfJMTdrFN17mLtZHo90V9tytkCnoFk6SRINc1d0jP3yJyj3FvIoSukmOOs+Ep6zE4Zj3RC+JxCm7LHWPA+nPeeJna51e4GIbt/JuSW5+9fH0+d9LSF3bj+9T8tOdgwR0IuId7EhEWSvIp7ecWMvOeR5TTzKjLiNSdTh+69Nft5zRXioJ6AtLOusTpfkZzhU3dUclhpC6vzmj6wtrokrrcuGW97Elh4e+5o15hT6ltYXcCV3480fOM9sCVtzVythHbJgtjmjEoq772T8g2wszrd0EzjnRJpUuJue+rSzfxK7hY2Ma/Xbm1wD27jD8mXi7KOdA2vu+bn3tHnzhnHibWx/o0P3PYHHx+rr36yTxI9J+ud73azBwWB5uf8g76//L/G5l07B/SDVn/g6a8+zh8cn5mMpn74z8Hc7Fd+XQCcQAAAABJRU5ErkJggg==',
|
||||
// ]
|
||||
//
|
||||
// const expectedResults = ['Melyik híres zenekar tagja volt Joe Muranyi?']
|
||||
|
||||
test('Img text recognition works', async () => {
|
||||
// TODO: tesseract keeps workers even after terminate(), and jest --detectOpenHandles detects them
|
||||
expect(true).toBeTruthy()
|
||||
// await tesseractLoaded
|
||||
// for (let i = 0; i < imgs.length; i++) {
|
||||
// const expectedResult = expectedResults[i]
|
||||
// const img = imgs[i]
|
||||
//
|
||||
// const text = await recognizeTextFromBase64(img)
|
||||
// expect(text.trim() === expectedResult).toBeTruthy()
|
||||
// }
|
||||
//
|
||||
// await terminateWorker()
|
||||
//
|
||||
// return new Promise<void>((resolve) => {
|
||||
// setTimeout(() => {
|
||||
// resolve()
|
||||
// }, 1 * 1000)
|
||||
// })
|
||||
})
|
|
@ -35,6 +35,7 @@ export interface QuestionData {
|
|||
val: string
|
||||
selectedByUser?: boolean
|
||||
}>
|
||||
base64?: string[]
|
||||
}
|
||||
|
||||
export interface Question {
|
||||
|
|
|
@ -21,6 +21,7 @@
|
|||
|
||||
import { isMainThread, parentPort, workerData } from 'worker_threads'
|
||||
|
||||
import { recognizeTextFromBase64, tesseractLoaded } from './tesseract'
|
||||
import logger from './logger'
|
||||
import {
|
||||
Question,
|
||||
|
@ -114,7 +115,10 @@ function normalizeSpaces(input: string): string {
|
|||
}
|
||||
|
||||
/**
 * Produces a single-line, trimmed version of a string.
 *
 * The input is first passed through `normalizeSpaces`; every run of
 * whitespace in the result is then collapsed to one space and leading /
 * trailing whitespace is removed.
 *
 * @param toremove - The text to clean up.
 * @returns The cleaned, single-line text.
 */
function removeUnnecesarySpaces(toremove: string): string {
  // `\s` already matches `\r` and `\n`, so collapsing whitespace runs also
  // removes every line break; a separate newline-stripping pass would never
  // find anything left to replace.
  return normalizeSpaces(toremove).replace(/\s+/g, ' ').trim()
}
|
||||
|
||||
function compareString(
|
||||
|
@ -278,6 +282,34 @@ function createQuestion(
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * Replaces the text of an image-based question with text recognized from
 * its base64 images.
 *
 * When `question.data.base64` is a non-empty array, every entry is passed to
 * `recognizeTextFromBase64` one after the other, in order. The recognized
 * texts are joined with a single space and returned as `Q` on a copy of the
 * question whose `data.type` is set to `'simple'`.
 *
 * If there are no images, or recognition of any image throws, the error (if
 * any) is printed to the console and the question is returned unchanged.
 *
 * @param question - The question to process; it is not mutated.
 * @returns A copy carrying the recognized text, or the original question.
 */
async function recognizeQuestionImage(question: Question): Promise<Question> {
  const images = question.data.base64
  if (!Array.isArray(images) || !images.length) {
    return question
  }

  try {
    const recognizedTexts: string[] = []
    // Images are recognized sequentially so the texts keep the image order.
    for (const image of images) {
      recognizedTexts.push(await recognizeTextFromBase64(image))
    }

    return {
      ...question,
      Q: recognizedTexts.join(' '),
      data: {
        ...question.data,
        type: 'simple',
      },
    }
  } catch (e) {
    // Best effort: a failed recognition must not break the caller, so the
    // error is only reported and the untouched question is returned below.
    console.error('Error happened in recognizeQuestionImage!')
    console.error(e)
  }

  return question
}
|
||||
|
||||
function compareImage(data: QuestionData, data2: QuestionData): number {
|
||||
if (data.hashedImages && data2.hashedImages) {
|
||||
return compareString(
|
||||
|
@ -643,6 +675,10 @@ interface WorkData {
|
|||
}
|
||||
|
||||
if (!isMainThread) {
|
||||
handleWorkerData()
|
||||
}
|
||||
|
||||
function handleWorkerData() {
|
||||
const {
|
||||
workerIndex,
|
||||
initData,
|
||||
|
@ -653,11 +689,12 @@ if (!isMainThread) {
|
|||
`[THREAD #${workerIndex}]: Worker ${workerIndex} reporting for duty`
|
||||
)
|
||||
|
||||
parentPort.on('message', (msg /*: TaskObject */) => {
|
||||
parentPort.on('message', async (msg /*: TaskObject */) => {
|
||||
await tesseractLoaded
|
||||
if (msg.type === 'work') {
|
||||
const {
|
||||
subjName,
|
||||
question,
|
||||
question: originalQuestion,
|
||||
searchTillMatchPercent,
|
||||
searchInAllIfNoResult,
|
||||
searchIn,
|
||||
|
@ -667,6 +704,8 @@ if (!isMainThread) {
|
|||
let searchResult: SearchResultQuestion[] = []
|
||||
let error = false
|
||||
|
||||
const question = await recognizeQuestionImage(originalQuestion)
|
||||
|
||||
try {
|
||||
qdbs.forEach((qdb) => {
|
||||
if (searchIn.includes(qdb.index)) {
|
||||
|
@ -857,8 +896,6 @@ if (!isMainThread) {
|
|||
})
|
||||
}
|
||||
})
|
||||
} else {
|
||||
// console.log('[THREAD]: Main thread!')
|
||||
}
|
||||
|
||||
export function cleanDb(
|
||||
|
@ -917,4 +954,5 @@ export {
|
|||
dataToString,
|
||||
doSearch,
|
||||
setNoPossibleAnswersPenalties,
|
||||
recognizeQuestionImage,
|
||||
}
|
||||
|
|
53
src/utils/tesseract.ts
Normal file
53
src/utils/tesseract.ts
Normal file
|
@ -0,0 +1,53 @@
|
|||
import {
|
||||
createWorker,
|
||||
Worker as TesseractWorker,
|
||||
ConfigResult,
|
||||
} from 'tesseract.js'
|
||||
|
||||
import logger from './logger'
|
||||
import { isMainThread, workerData } from 'worker_threads'
|
||||
|
||||
// https://github.com/naptha/tesseract.js/blob/master/docs/api.md
|
||||
let tesseractWorker: TesseractWorker = null
|
||||
/**
 * Creates a tesseract.js worker and prepares it for recognition.
 *
 * The worker is created with `cacheMethod: 'refresh'`, then loaded and
 * initialized with the `hun+eng` language set.
 *
 * The returned worker is not terminated here; the caller owns it.
 *
 * @returns The initialized worker.
 */
export async function initTesseractWorker(): Promise<TesseractWorker> {
  const languages = 'hun+eng'

  // Add `logger: (m) => console.log(m)` to the options to trace progress.
  const ocrWorker = createWorker({ cacheMethod: 'refresh' })

  await ocrWorker.load()
  await ocrWorker.loadLanguage(languages)
  await ocrWorker.initialize(languages)

  return ocrWorker
}
|
||||
|
||||
// Resolver of `tesseractLoaded`, captured so the initialization chain below
// can settle the promise from outside its executor.
let resolveLoaded: () => void = null

/**
 * Resolves once the initialization attempt started at module load has
 * finished, whether it succeeded or failed. It never rejects, so code that
 * awaits it cannot hang or crash because of a failed load.
 *
 * After a failed load `tesseractWorker` stays `null`, and recognition calls
 * made through it will fail.
 */
export const tesseractLoaded: Promise<void> = new Promise((resolve) => {
  resolveLoaded = resolve
})

initTesseractWorker()
  .then((worker) => {
    tesseractWorker = worker

    if (isMainThread) {
      logger.Log('Tesseract loaded on main thread')
    } else {
      const { workerIndex }: { workerIndex: number } = workerData
      logger.Log(`[THREAD #${workerIndex}]: Tesseract loaded`)
    }
  })
  .catch((e: unknown) => {
    // Without this handler a failed load would leave `tesseractLoaded`
    // pending forever and surface only as an unhandled rejection.
    console.error('Failed to initialize the Tesseract worker!')
    console.error(e)
  })
  .finally(() => {
    // Always release the waiters, even when loading failed.
    resolveLoaded()
  })
|
||||
|
||||
/**
 * Recognizes the text on a base64 encoded image with the module's
 * tesseract worker.
 *
 * @param base64 - The image, passed as-is to the worker's `recognize`.
 * @returns The `data.text` field of the recognition result, untrimmed.
 * @throws Error if the worker has not been initialized (yet); await
 *   `tesseractLoaded` before calling to avoid racing the initialization.
 */
export async function recognizeTextFromBase64(base64: string): Promise<string> {
  // The worker is assigned asynchronously after module load, so it can
  // still be null here. Fail with a descriptive error instead of a
  // "cannot read properties of null" TypeError.
  if (!tesseractWorker) {
    throw new Error(
      'Tesseract worker is not initialized, await tesseractLoaded before recognizing text'
    )
  }

  const result = await tesseractWorker.recognize(base64)
  return result.data.text
}
|
||||
|
||||
/**
 * Terminates the module's tesseract worker, if one has been created.
 *
 * @returns The result of the worker's `terminate()` call, or nothing when
 *   there is no worker to terminate.
 */
export async function terminateWorker(): Promise<void | ConfigResult> {
  if (!tesseractWorker) {
    return
  }

  return tesseractWorker.terminate()
}
|
|
@ -2,7 +2,7 @@
|
|||
"compilerOptions": {
|
||||
"noUnusedLocals": true,
|
||||
"noUnusedParameters": true,
|
||||
"noImplicitReturns":true,
|
||||
"noImplicitReturns": true,
|
||||
"noFallthroughCasesInSwitch": true,
|
||||
"suppressImplicitAnyIndexErrors": true,
|
||||
"moduleResolution": "node",
|
||||
|
@ -15,16 +15,10 @@
|
|||
"sourceMap": true,
|
||||
"outDir": "dist",
|
||||
"noImplicitAny": true,
|
||||
"lib": [
|
||||
"ES2020"
|
||||
]
|
||||
"lib": ["dom", "ES2020"]
|
||||
},
|
||||
"files": [
|
||||
"src/server.ts"
|
||||
],
|
||||
"include": [
|
||||
"src/**/*"
|
||||
],
|
||||
"files": ["src/server.ts"],
|
||||
"include": ["src/**/*"],
|
||||
"exclude": [
|
||||
"src/tests/",
|
||||
"node_modules",
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue