mirror of
https://gitlab.com/MrFry/mrfrys-node-server
synced 2025-04-01 20:24:18 +02:00
added tesseract package, trying to recognize text from base64 image
This commit is contained in:
parent
70e7af0ba0
commit
00ec614f1d
9 changed files with 484 additions and 284 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -9,3 +9,4 @@ duplicateRemovingLog/
|
||||||
src/extraModules
|
src/extraModules
|
||||||
/*.sh
|
/*.sh
|
||||||
extraSubmodules
|
extraSubmodules
|
||||||
|
*.traineddata
|
||||||
|
|
617
package-lock.json
generated
617
package-lock.json
generated
File diff suppressed because it is too large
Load diff
|
@ -10,6 +10,7 @@
|
||||||
"express": "^4.17.3",
|
"express": "^4.17.3",
|
||||||
"express-fileupload": "^1.3.1",
|
"express-fileupload": "^1.3.1",
|
||||||
"socket.io": "^4.4.1",
|
"socket.io": "^4.4.1",
|
||||||
|
"tesseract.js": "^3.0.3",
|
||||||
"ts-node": "^10.7.0",
|
"ts-node": "^10.7.0",
|
||||||
"typescript": "^4.6.2",
|
"typescript": "^4.6.2",
|
||||||
"uuid": "^8.3.2",
|
"uuid": "^8.3.2",
|
||||||
|
|
|
@ -621,6 +621,7 @@ function setup(data: SubmoduleData): Submodule {
|
||||||
if (totalNewQuestions > 0) {
|
if (totalNewQuestions > 0) {
|
||||||
resultArray.forEach((result) => {
|
resultArray.forEach((result) => {
|
||||||
msgAllWorker({
|
msgAllWorker({
|
||||||
|
// TODO: recognize base64 image
|
||||||
type: 'newQuestions',
|
type: 'newQuestions',
|
||||||
data: result,
|
data: result,
|
||||||
})
|
})
|
||||||
|
|
32
src/tests/base64ToText.test.ts
Normal file
32
src/tests/base64ToText.test.ts
Normal file
|
@ -0,0 +1,32 @@
|
||||||
|
// import {
|
||||||
|
// recognizeTextFromBase64,
|
||||||
|
// terminateWorker,
|
||||||
|
// tesseractLoaded,
|
||||||
|
// } from '../utils/tesseract'
|
||||||
|
//
|
||||||
|
// const imgs = [
|
||||||
|
// 'data:image/png;base64, iVBORw0KGgoAAAANSUhEUgAAARAAAAATCAYAAABY6VIPAAAAAXNSR0IArs4c6QAAAARzQklUCAgICHwIZIgAAAUzSURBVHic7Zs/axtJFMB/hHyCxW5SXBNYctdclUZwAYMbF2EhVRpjOFRe5cJwjYprBCn8CURA3CcQKdIYDAmIg1RXJQjcXHGNgj7DFTMv+/R2dnZWUmSdmR8IW7Ozb96+PzPzZm3IZDKZTCaTyWQymUwmk8lkMplMJyPgLnJ94K+XifImCX0LL3Ogxhglyh8CVU/5D4E+NrpvJjg/7ZNN7ZMSr5uy69zaFzZ/Sv/9Trc/Vjfc4pJyFhD2q7++S1bA0x3L1LwBnvtxMt2UwBW7S/p9Tx4hKuAYN0HcJ/vOrV1g8/NH9b0E3gNPH6kOfwEvA4IObWZMocAlQ548MofAQ8gtPfkt/M9CTyBL/9Nu+V8DbwMCZeslnyLQp6K5pRxQrwixraNs/UJybR9bqrwCflHjjXAroi2Z2vTXcttWr4m5X8suWtqhLr/adLeyYyu5tpEdU9/XZgOhwq0oJ+Zea6OYntJX7GjLzCHt9taEtvO6rerQSRgB18DvHf1C2O269UHMvyH65FYVGG9Cs9TXvkzxfRW4vml+DoExaoGWAWwNKYpZYWJgCQI9qO0bSs5BoJ8eexh4MPsAOqgkePX1ylzTzovpb43addYifbTTQpOJTqyY7hIcQih4RI7VU9s5NEYs0Euak+XQXI/pKc8Vm0C0rm3+tbbUz1wZHcW2oRgKyWojFttyXT9LzL+WvrmVMoFYX6b4XvS3NkvNT2Go9XtkLs6BC3XTK+CSJq99+0rdJ8pZxtS7gcLLnwf6aQWfAH9E+ohc2UrFxgdXY+oxY/ovgR9U31Ddqilx21M9AU2VzJXX9VmC7iXOPjpJ3gI/mTFDNpqxXrJNgSP13dogBa3Ht20rtR/topEq6zPrNtZ8xO0ahFPgxv9+jTvbElY4P77oGLsPNjbwY0oJkuLfEKm5lYL1ZYrvJY5X5nqf/CxxcffNl48DncQhC5wjn7cIu/YfzRHNc4cPuPOImVd03CIPnPJExtQsA22h8QH+CbS16T/HHbzJ6tB1EPseODNtF9TPIuhgadNdsKf2UyNb9LJMcGWI8Lf6PWSDLgrgk2kTPfse/A2APxP6SYAPgC+459UT5VfTfwn83FOXLqx/vrJu1y7/tpGaW12EfNnX98f+Z5/8PLINoQnkIy5olpg6x3BO2oomK5es1G8ifae41eYT3/cNDcT1n/lP2aHLCBcUC9M+ZvOT/1vi2+42G93hnknu3fZ1r5y06wk09joydlZV4XxvT/HbuKHeddmAtovEMfBvRFYKJ7gk0jLtmHrC3NS/qbnVl2183yc/55icsSUM1Fuya9zsFOKGtNVEeEe9zbTJZpnjkjIWrNsS01/XurLahZJD6kRb4sgWfJMTdrFN17mLtZHo90V9tytkCnoFk6SRINc1d0jP3yJyj3FvIoSukmOOs+Ep6zE4Zj3RC+JxCm7LHWPA+nPeeJna51e4GIbt/JuSW5+9fH0+d9LSF3bj+9T8tOdgwR0IuId7EhEWSvIp7ecWMvOeR5TTzKjLiNSdTh+69Nft5zRXioJ6AtLOusTpfkZzhU3dUclhpC6vzmj6wtrokrrcuGW97Elh4e+5o15hT6ltYXcCV3480fOM9sCVtzVythHbJgtjmjEoq772T8g2wszrd0EzjnRJpUuJue+rSzfxK7hY2Ma/Xbm1wD27jD8mXi7KOdA2vu+bn3tHnzhnHibWx/o0P3PYHHx+rr36yTxI9J+ud73azBwWB5uf8g76//L/G5l07B/SDVn/g6a8+zh8cn5mMpn74z8Hc7Fd+XQCcQAAAABJRU5ErkJggg==',
|
||||||
|
// ]
|
||||||
|
//
|
||||||
|
// const expectedResults = ['Melyik híres zenekar tagja volt Joe Muranyi?']
|
||||||
|
|
||||||
|
test('Img text recognition works', async () => {
|
||||||
|
// TODO: tesseract keeps workers even after terminate(), and jest --detectOpenHandles detects them
|
||||||
|
expect(true).toBeTruthy()
|
||||||
|
// await tesseractLoaded
|
||||||
|
// for (let i = 0; i < imgs.length; i++) {
|
||||||
|
// const expectedResult = expectedResults[i]
|
||||||
|
// const img = imgs[i]
|
||||||
|
//
|
||||||
|
// const text = await recognizeTextFromBase64(img)
|
||||||
|
// expect(text.trim() === expectedResult).toBeTruthy()
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// await terminateWorker()
|
||||||
|
//
|
||||||
|
// return new Promise<void>((resolve) => {
|
||||||
|
// setTimeout(() => {
|
||||||
|
// resolve()
|
||||||
|
// }, 1 * 1000)
|
||||||
|
// })
|
||||||
|
})
|
|
@ -35,6 +35,7 @@ export interface QuestionData {
|
||||||
val: string
|
val: string
|
||||||
selectedByUser?: boolean
|
selectedByUser?: boolean
|
||||||
}>
|
}>
|
||||||
|
base64?: string[]
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface Question {
|
export interface Question {
|
||||||
|
|
|
@ -21,6 +21,7 @@
|
||||||
|
|
||||||
import { isMainThread, parentPort, workerData } from 'worker_threads'
|
import { isMainThread, parentPort, workerData } from 'worker_threads'
|
||||||
|
|
||||||
|
import { recognizeTextFromBase64, tesseractLoaded } from './tesseract'
|
||||||
import logger from './logger'
|
import logger from './logger'
|
||||||
import {
|
import {
|
||||||
Question,
|
Question,
|
||||||
|
@ -114,7 +115,10 @@ function normalizeSpaces(input: string): string {
|
||||||
}
|
}
|
||||||
|
|
||||||
function removeUnnecesarySpaces(toremove: string): string {
|
function removeUnnecesarySpaces(toremove: string): string {
|
||||||
return normalizeSpaces(toremove).replace(/\s+/g, ' ')
|
return normalizeSpaces(toremove)
|
||||||
|
.replace(/\s+/g, ' ')
|
||||||
|
.replace(/(\r\n|\n|\r)/gm, '')
|
||||||
|
.trim()
|
||||||
}
|
}
|
||||||
|
|
||||||
function compareString(
|
function compareString(
|
||||||
|
@ -278,6 +282,34 @@ function createQuestion(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async function recognizeQuestionImage(question: Question): Promise<Question> {
|
||||||
|
const base64Data = question.data.base64
|
||||||
|
if (Array.isArray(base64Data) && base64Data.length) {
|
||||||
|
try {
|
||||||
|
const res: string[] = []
|
||||||
|
for (let i = 0; i < base64Data.length; i++) {
|
||||||
|
const base64 = base64Data[i]
|
||||||
|
const text = await recognizeTextFromBase64(base64)
|
||||||
|
res.push(text)
|
||||||
|
}
|
||||||
|
|
||||||
|
return {
|
||||||
|
...question,
|
||||||
|
Q: res.join(' '),
|
||||||
|
data: {
|
||||||
|
...question.data,
|
||||||
|
type: 'simple',
|
||||||
|
},
|
||||||
|
}
|
||||||
|
} catch (e) {
|
||||||
|
console.error('Error happened in recognizeQuestionImage!')
|
||||||
|
console.error(e)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return question
|
||||||
|
}
|
||||||
|
|
||||||
function compareImage(data: QuestionData, data2: QuestionData): number {
|
function compareImage(data: QuestionData, data2: QuestionData): number {
|
||||||
if (data.hashedImages && data2.hashedImages) {
|
if (data.hashedImages && data2.hashedImages) {
|
||||||
return compareString(
|
return compareString(
|
||||||
|
@ -643,6 +675,10 @@ interface WorkData {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!isMainThread) {
|
if (!isMainThread) {
|
||||||
|
handleWorkerData()
|
||||||
|
}
|
||||||
|
|
||||||
|
function handleWorkerData() {
|
||||||
const {
|
const {
|
||||||
workerIndex,
|
workerIndex,
|
||||||
initData,
|
initData,
|
||||||
|
@ -653,11 +689,12 @@ if (!isMainThread) {
|
||||||
`[THREAD #${workerIndex}]: Worker ${workerIndex} reporting for duty`
|
`[THREAD #${workerIndex}]: Worker ${workerIndex} reporting for duty`
|
||||||
)
|
)
|
||||||
|
|
||||||
parentPort.on('message', (msg /*: TaskObject */) => {
|
parentPort.on('message', async (msg /*: TaskObject */) => {
|
||||||
|
await tesseractLoaded
|
||||||
if (msg.type === 'work') {
|
if (msg.type === 'work') {
|
||||||
const {
|
const {
|
||||||
subjName,
|
subjName,
|
||||||
question,
|
question: originalQuestion,
|
||||||
searchTillMatchPercent,
|
searchTillMatchPercent,
|
||||||
searchInAllIfNoResult,
|
searchInAllIfNoResult,
|
||||||
searchIn,
|
searchIn,
|
||||||
|
@ -667,6 +704,8 @@ if (!isMainThread) {
|
||||||
let searchResult: SearchResultQuestion[] = []
|
let searchResult: SearchResultQuestion[] = []
|
||||||
let error = false
|
let error = false
|
||||||
|
|
||||||
|
const question = await recognizeQuestionImage(originalQuestion)
|
||||||
|
|
||||||
try {
|
try {
|
||||||
qdbs.forEach((qdb) => {
|
qdbs.forEach((qdb) => {
|
||||||
if (searchIn.includes(qdb.index)) {
|
if (searchIn.includes(qdb.index)) {
|
||||||
|
@ -857,8 +896,6 @@ if (!isMainThread) {
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
} else {
|
|
||||||
// console.log('[THREAD]: Main thread!')
|
|
||||||
}
|
}
|
||||||
|
|
||||||
export function cleanDb(
|
export function cleanDb(
|
||||||
|
@ -917,4 +954,5 @@ export {
|
||||||
dataToString,
|
dataToString,
|
||||||
doSearch,
|
doSearch,
|
||||||
setNoPossibleAnswersPenalties,
|
setNoPossibleAnswersPenalties,
|
||||||
|
recognizeQuestionImage,
|
||||||
}
|
}
|
||||||
|
|
53
src/utils/tesseract.ts
Normal file
53
src/utils/tesseract.ts
Normal file
|
@ -0,0 +1,53 @@
|
||||||
|
import {
|
||||||
|
createWorker,
|
||||||
|
Worker as TesseractWorker,
|
||||||
|
ConfigResult,
|
||||||
|
} from 'tesseract.js'
|
||||||
|
|
||||||
|
import logger from './logger'
|
||||||
|
import { isMainThread, workerData } from 'worker_threads'
|
||||||
|
|
||||||
|
// https://github.com/naptha/tesseract.js/blob/master/docs/api.md
|
||||||
|
let tesseractWorker: TesseractWorker = null
|
||||||
|
export async function initTesseractWorker(): Promise<TesseractWorker> {
|
||||||
|
const worker = createWorker({
|
||||||
|
cacheMethod: 'refresh',
|
||||||
|
// logger: (m) => console.log(m),
|
||||||
|
})
|
||||||
|
await worker.load()
|
||||||
|
await worker.loadLanguage('hun+eng')
|
||||||
|
await worker.initialize('hun+eng')
|
||||||
|
return worker
|
||||||
|
// await worker.terminate();
|
||||||
|
}
|
||||||
|
|
||||||
|
let resolveLoaded: () => void = null
|
||||||
|
export const tesseractLoaded: Promise<void> = new Promise((resolve) => {
|
||||||
|
resolveLoaded = resolve
|
||||||
|
})
|
||||||
|
|
||||||
|
initTesseractWorker().then((worker) => {
|
||||||
|
tesseractWorker = worker
|
||||||
|
|
||||||
|
if (isMainThread) {
|
||||||
|
logger.Log('Tesseract loaded on main thread')
|
||||||
|
} else {
|
||||||
|
const { workerIndex }: { workerIndex: number } = workerData
|
||||||
|
logger.Log(`[THREAD #${workerIndex}]: Tesseract loaded`)
|
||||||
|
}
|
||||||
|
resolveLoaded()
|
||||||
|
})
|
||||||
|
|
||||||
|
export async function recognizeTextFromBase64(base64: string): Promise<string> {
|
||||||
|
const {
|
||||||
|
data: { text },
|
||||||
|
} = await tesseractWorker.recognize(base64)
|
||||||
|
return text
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function terminateWorker(): Promise<void | ConfigResult> {
|
||||||
|
if (tesseractWorker) {
|
||||||
|
return tesseractWorker.terminate()
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
|
@ -2,7 +2,7 @@
|
||||||
"compilerOptions": {
|
"compilerOptions": {
|
||||||
"noUnusedLocals": true,
|
"noUnusedLocals": true,
|
||||||
"noUnusedParameters": true,
|
"noUnusedParameters": true,
|
||||||
"noImplicitReturns":true,
|
"noImplicitReturns": true,
|
||||||
"noFallthroughCasesInSwitch": true,
|
"noFallthroughCasesInSwitch": true,
|
||||||
"suppressImplicitAnyIndexErrors": true,
|
"suppressImplicitAnyIndexErrors": true,
|
||||||
"moduleResolution": "node",
|
"moduleResolution": "node",
|
||||||
|
@ -15,16 +15,10 @@
|
||||||
"sourceMap": true,
|
"sourceMap": true,
|
||||||
"outDir": "dist",
|
"outDir": "dist",
|
||||||
"noImplicitAny": true,
|
"noImplicitAny": true,
|
||||||
"lib": [
|
"lib": ["dom", "ES2020"]
|
||||||
"ES2020"
|
|
||||||
]
|
|
||||||
},
|
},
|
||||||
"files": [
|
"files": ["src/server.ts"],
|
||||||
"src/server.ts"
|
"include": ["src/**/*"],
|
||||||
],
|
|
||||||
"include": [
|
|
||||||
"src/**/*"
|
|
||||||
],
|
|
||||||
"exclude": [
|
"exclude": [
|
||||||
"src/tests/",
|
"src/tests/",
|
||||||
"node_modules",
|
"node_modules",
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue