From 8fdc62349ba5ec75373bfbea0fb788f263daf749 Mon Sep 17 00:00:00 2001 From: mrfry Date: Wed, 17 Mar 2021 12:24:50 +0100 Subject: [PATCH] Search speedup by: caching split questions/answers, and refactoring string compare algorithm --- src/modules/api/api.ts | 4 +- src/standaloneUtils/rmDuplicates.js | 41 +++++---- src/types/basicTypes.ts | 4 + src/utils/actions.ts | 50 +++++++++- src/utils/classes.ts | 137 +++++++++++++++------------- submodules/qmining-page | 2 +- 6 files changed, 152 insertions(+), 86 deletions(-) diff --git a/src/modules/api/api.ts b/src/modules/api/api.ts index ccfd1a3..ea45717 100644 --- a/src/modules/api/api.ts +++ b/src/modules/api/api.ts @@ -33,6 +33,7 @@ import { processIncomingRequest, logResult, backupData, + writeData, shouldSaveDataFile, shouldSearchDataFile, loadJSON, @@ -1364,6 +1365,7 @@ function GetApp(): ModuleType { function deleteComment(obj, path) { if (path.length === 1) { + // TODO: check if its actually deleteable by user (deleting other users comments) obj.splice(path[0], 1) } else { const i = path.pop() @@ -1888,7 +1890,7 @@ function GetApp(): ModuleType { } if (saveDb) { - utils.WriteFile(JSON.stringify(currDb.data), currDb.path) + writeData(currDb.data, currDb.path) msgAllWorker({ qdbs: questionDbs, type: 'update', diff --git a/src/standaloneUtils/rmDuplicates.js b/src/standaloneUtils/rmDuplicates.js index 62be432..6258993 100644 --- a/src/standaloneUtils/rmDuplicates.js +++ b/src/standaloneUtils/rmDuplicates.js @@ -1,37 +1,41 @@ -const minpercent = 97 -const resultDbFileName = 'res.json' - -// --------------------------------------------------------------------------------------------------- - const utils = require('../../dist/utils/utils.js').default // eslint-disable-line const logger = require('../../dist/utils/logger.js').default // eslint-disable-line const { addQuestion, doSearch } = require('../../dist/utils/classes.js') // eslint-disable-line +const { loadData } = require('../../dist/utils/actions.js') // 
eslint-disable-line -// TODO: merge 2 dbs -// TODO: filter questions out from a db based on another, producing a new one - -const params = process.argv.splice(2) - -console.log('Params', params) - -const fileA = params[0] -const fileB = params[1] - -const dbA = utils.ReadJSON(fileA) -const dbB = fileB ? utils.ReadJSON(fileB) : null - +const minpercent = 95 +const resultDbFileName = 'res.json' const line = '====================================================================' const logPath = './duplicateRemovingLog/' utils.CreatePath(logPath) +const params = process.argv.splice(2) + +const fileA = params[0] +const fileB = params[1] + +console.time('load') +const dbA = loadData(fileA) +const dbB = fileB ? loadData(fileB) : null +console.timeEnd('load') + +console.time('rmduplicates') if (!dbB) { + console.log(`Removing duplicate questions from ${fileA}`) rmDuplicates(dbA).then((res) => { + console.timeEnd('rmduplicates') utils.WriteFile(JSON.stringify(res), resultDbFileName) console.log('File written') }) } else { + console.log( + `Removing questions found in ${C('green')}${fileB}${C()} from ${C( + 'green' + )}${fileA}${C()}` + ) difference({ dbA: dbA, dbB: dbB }).then((res) => { + console.timeEnd('rmduplicates') utils.WriteFile(JSON.stringify(res), resultDbFileName) console.log('File written') }) @@ -85,6 +89,7 @@ async function difference({ dbA, dbB }) { subjName: subj.Name, question: question, searchInAllIfNoResult: doingDifference, + searchTillMatchPercent: minpercent, }) printProgressBar(j + 1, subj.Questions.length) diff --git a/src/types/basicTypes.ts b/src/types/basicTypes.ts index 47a38ee..5346df2 100644 --- a/src/types/basicTypes.ts +++ b/src/types/basicTypes.ts @@ -10,6 +10,10 @@ export interface Question { Q: string A: string data: QuestionData + cache?: { + Q: string + A: string + } } export interface Subject { diff --git a/src/utils/actions.ts b/src/utils/actions.ts index a762462..9a6704a 100755 --- a/src/utils/actions.ts +++ b/src/utils/actions.ts @@ 
-28,7 +28,13 @@ import utils from '../utils/utils' import { SearchResult, addQuestion, getSubjNameWithoutYear } from './classes' // types -import { QuestionDb, Question, User, DataFile } from '../types/basicTypes' +import { + QuestionDb, + Subject, + Question, + User, + DataFile, +} from '../types/basicTypes' // if a recievend question doesnt match at least this % to any other question in the db it gets // added to db @@ -219,7 +225,7 @@ function processIncomingRequestUsingDb( if (currWrites >= writeAfter && !dryRun) { currWrites = 0 logger.DebugLog('Writing data.json', 'isadding', 1) - utils.WriteFile(JSON.stringify(qdb.data), qdb.path) + writeData(qdb.data, qdb.path) } } @@ -335,6 +341,20 @@ export function shouldSaveDataFile( return false } +export function loadData(path: string): Array { + return JSON.parse(utils.ReadFile(path)).reduce((acc, subj) => { + return [ + ...acc, + { + Name: subj.Name, + Questions: subj.Questions.map((question) => { + return createQuestion(question) + }), + }, + ] + }, []) +} + export function loadJSON( dataFiles: Array, dataDir: string @@ -351,7 +371,7 @@ export function loadJSON( ...dataFile, path: dataPath, index: index, - data: JSON.parse(utils.ReadFile(dataPath)), + data: loadData(dataPath), }) } catch (err) { console.error(err) @@ -364,14 +384,34 @@ export function loadJSON( }, []) } +export function writeData(data: Array, path: string): void { + utils.WriteFile( + JSON.stringify( + data.map((subj) => { + return { + Name: subj.Name, + Questions: subj.Questions.map((question) => { + return { + Q: question.Q, + A: question.A, + data: question.data, + } + }), + } + }) + ), + path + ) +} + export function backupData(questionDbs: Array): void { questionDbs.forEach((data) => { const path = './publicDirs/qminingPublic/backs/' utils.CreatePath(path) try { logger.Log(`Backing up ${data.name}...`) - utils.WriteFile( - JSON.stringify(data.data), + writeData( + data.data, `${path}${data.name}_${utils.GetDateString(true)}.json` ) 
logger.Log('Done') diff --git a/src/utils/classes.ts b/src/utils/classes.ts index bfc7e33..6ca9edd 100755 --- a/src/utils/classes.ts +++ b/src/utils/classes.ts @@ -28,12 +28,12 @@ const commonUselessAnswerParts = [ "'", ] -const commonUselessStringParts = [',', '\\.', ':', '!', '\\+', '\\s*\\.'] -const specialChars = ['&', '\\+'] +// const commonUselessStringParts = [',', '\\.', ':', '!', '\\+', '\\s*\\.'] /* Percent minus for length difference */ const lengthDiffMultiplier = 10 /* Minimum ammount to consider that two questions match during answering */ const minMatchAmmount = 70 +const magicNumber = 0.7 // same as minMatchAmmount, but /100 /* If all of the results are below this match percent (when only one subject is searched due to * subject name matching) then all subjects are searched for answer */ const minMatchToNotSearchOtherSubjects = 90 @@ -55,6 +55,14 @@ function getSubjNameWithoutYear(subjName: string): string { // Not exported // --------------------------------------------------------------------------------------------------------- + +function simplifyString(toremove) { + return toremove + .replace(/\s/g, ' ') + .replace(/\s+/g, ' ') + .toLowerCase() +} + function removeStuff( value: string, removableStrings: Array, @@ -67,55 +75,49 @@ function removeStuff( return value } -// removes whitespace from begining and and, and replaces multiple spaces with one space -function removeUnnecesarySpaces(toremove: string) { - assert(toremove) - - toremove = normalizeSpaces(toremove) - while (toremove.includes(' ')) { - toremove = toremove.replace(/ {2}/g, ' ') - } - return toremove.trim() -} - -// simplifies a string for easier comparison -function simplifyStringForComparison(value: string) { - assert(value) - - value = removeUnnecesarySpaces(value).toLowerCase() - return removeStuff(value, commonUselessStringParts) -} - -function removeSpecialChars(value: string) { - assert(value) - - return removeStuff(value, specialChars, ' ') -} - // damn nonbreaking space 
-function normalizeSpaces(input: string) { - assert(input) - +function normalizeSpaces(input) { return input.replace(/\s/g, ' ') } -function compareString(string1: string, string2: string) { - if (!string1 || !string2) { - if (!string1 && !string2) { +function removeUnnecesarySpaces(toremove: string) { + return normalizeSpaces(toremove).replace(/\s+/g, ' ') +} + +function compareString(s1, s2) { + if (!s1 || !s2) { + if (!s1 && !s2) { + return 100 + } else { + return 0 + } + } + if (s1.length < 0 || s2.length < 0) { + if (s1.length === 0 && s2.length === 0) { return 100 } else { return 0 } } - const s1 = simplifyStringForComparison(string1).split(' ') - const s2 = simplifyStringForComparison(string2).split(' ') let match = 0 - for (let i = 0; i < s1.length; i++) { - if (s2.includes(s1[i])) { - match++ + let lastMatchIndex = -1 + let i = 0 + + while (i < s1.length) { + if (match / i < magicNumber) { + break } + + const currMatchIndex = s2.indexOf(s1[i]) + if (lastMatchIndex < currMatchIndex) { + match++ + lastMatchIndex = currMatchIndex + } + + i++ } + let percent = Math.round(parseFloat(((match / s1.length) * 100).toFixed(2))) const lengthDifference = Math.abs(s2.length - s1.length) percent -= lengthDifference * lengthDiffMultiplier @@ -163,7 +165,6 @@ function simplifyAnswer(value: string) { return value } return simplifyQA(value, [ - removeSpecialChars, removeUnnecesarySpaces, answerPreProcessor, removeAnswerLetters, @@ -175,22 +176,16 @@ function simplifyQuestion(question: Question | string) { return } if (typeof question === 'string') { - return simplifyQA(question, [ - removeSpecialChars, - removeUnnecesarySpaces, - removeAnswerLetters, - ]) + return simplifyQA(question, [removeUnnecesarySpaces, removeAnswerLetters]) } else { if (question.Q) { question.Q = simplifyQA(question.Q, [ - removeSpecialChars, removeUnnecesarySpaces, removeAnswerLetters, ]) } if (question.A) { question.A = simplifyQA(question.A, [ - removeSpecialChars, removeUnnecesarySpaces, 
removeAnswerLetters, ]) @@ -205,13 +200,29 @@ function simplifyQuestion(question: Question | string) { function createQuestion( question: Question | string, - answer: string, - data: QuestionData + answer?: string, + data?: QuestionData ): Question { - return { - Q: simplifyQuestion(question), - A: answer ? simplifyAnswer(answer) : undefined, - data: data, + try { + if (typeof question === 'string') { + return { + Q: simplifyQuestion(question), + A: answer ? simplifyAnswer(answer) : undefined, + data: data, + } + } else { + return { + ...question, + cache: { + Q: question.Q ? simplifyString(question.Q).split(' ') : [], + A: question.A ? simplifyString(question.A).split(' ') : [], + }, + } + } + } catch (err) { + logger.Log('Error creating question', logger.GetColor('redbg')) + console.error(question, answer, data) + console.error(err) } } @@ -257,11 +268,11 @@ function compareData(q1: Question, q2: Question) { } function compareQuestion(q1: Question, q2: Question) { - return compareString(q1.Q, q2.Q) + return compareString(q1.cache.Q, q2.cache.Q) } function compareAnswer(q1: Question, q2: Question) { - return compareString(q1.A, q2.A) + return compareString(q1.cache.A, q2.cache.A) } function compareQuestionObj( @@ -328,7 +339,11 @@ function searchSubject( assert(question) let result = [] - subj.Questions.every((currentQuestion) => { + + let stopSearch = false + let i = subj.Questions.length - 1 + while (i >= 0 && !stopSearch) { + const currentQuestion = subj.Questions[i] const percent = compareQuestionObj( currentQuestion, subjName, @@ -337,7 +352,7 @@ function searchSubject( question.data ) - if (percent.avg > minMatchAmmount) { + if (percent.avg >= minMatchAmmount) { result.push({ q: currentQuestion, match: percent.avg, @@ -346,11 +361,11 @@ function searchSubject( } if (searchTillMatchPercent && percent.avg >= searchTillMatchPercent) { - return false + stopSearch = true } - return true - }) + i-- + } result = result.sort((q1, q2) => { if (q1.match < q2.match) { 
@@ -421,9 +436,9 @@ function prepareQuestion( let preparedQuestion: Question if (typeof question === 'object') { - preparedQuestion = question + preparedQuestion = createQuestion(question) } else { - let parsedData + let parsedData: any if (typeof data === 'string') { try { parsedData = JSON.parse(data) diff --git a/submodules/qmining-page b/submodules/qmining-page index 49eae83..7f41637 160000 --- a/submodules/qmining-page +++ b/submodules/qmining-page @@ -1 +1 @@ -Subproject commit 49eae83f8194ab9585939b93119f82f7c0da16bb +Subproject commit 7f4163736cc0bfed3259f39f7bc0063ca191da21