From 43b8d939c1543153c9b551b37e086f11fb585206 Mon Sep 17 00:00:00 2001 From: mrfry Date: Mon, 29 Mar 2021 18:19:02 +0200 Subject: [PATCH] Duplicate remover: managing possible questions --- src/standaloneUtils/rmDuplicates.js | 204 +++++++++++++++++++++++----- src/utils/classes.ts | 3 +- submodules/qmining-data-editor | 2 +- 3 files changed, 174 insertions(+), 35 deletions(-) diff --git a/src/standaloneUtils/rmDuplicates.js b/src/standaloneUtils/rmDuplicates.js index c49fb98..3bc4fa2 100644 --- a/src/standaloneUtils/rmDuplicates.js +++ b/src/standaloneUtils/rmDuplicates.js @@ -1,7 +1,13 @@ const utils = require('../../dist/utils/utils.js').default // eslint-disable-line const logger = require('../../dist/utils/logger.js').default // eslint-disable-line -const { addQuestion, doSearch } = require('../../dist/utils/classes.js') // eslint-disable-line +const { + addQuestion, + doSearch, + compareQuestionObj, + createQuestion, +} = require('../../dist/utils/classes.js') // eslint-disable-line const { loadData, writeData } = require('../../dist/utils/actions.js') // eslint-disable-line +const fs = require('fs') // eslint-disable-line // Params [ 'publicDirs/qminingPublic/questionDbs/elektro.json' ] // load: 2.767ms @@ -38,40 +44,149 @@ utils.WriteFile('', globalLog) const params = process.argv.splice(2) -const fileA = params[0] -const fileB = params[1] +const pathA = params[0] +const pathB = params[1] -console.time('load') -const dbA = loadData(fileA) -const dbB = fileB ? loadData(fileB) : null -console.timeEnd('load') +const stat = fs.lstatSync(pathA) +if (stat.isDirectory()) { + if (pathB) { + log( + `Clearing possible questions from ${C( + 'green' + )}${pathA}${C()} based on ${C('green')}${pathB}${C()} db` + ) + const db = pathB ? loadData(pathB) : null -console.time('rmduplicates') -if (!dbB) { - log(`Removing duplicate questions from ${fileA}`) - rmDuplicates(dbA).then((res) => { - console.timeEnd('rmduplicates') - writeData(res, resultDbFileName) - log('File written') - }) + clearPossibleAnswers(pathA, db) + } else { + removePossibleAnswersDuplicates(pathA) + } } else { - log( - `Removing questions found in ${C('green')}${fileB}${C()} from ${C( - 'green' - )}${fileA}${C()}` - ) - difference({ dbA: dbA, dbB: dbB }).then((res) => { - console.timeEnd('rmduplicates') - writeData(res, resultDbFileName) - log('File written') + console.time('load') + const dbA = loadData(pathA) + const dbB = pathB ? loadData(pathB) : null + console.timeEnd('load') + + console.time('rmduplicates') + + if (!dbB) { + log(`Removing duplicate questions from ${C('green')}${pathA}${C()}`) + rmDuplicates(dbA).then((res) => { + console.timeEnd('rmduplicates') + writeData(res, resultDbFileName) + log('File written') + }) + } else { + log( + `Removing questions found in ${C('green')}${pathB}${C()} from ${C( + 'green' + )}${pathA}${C()}` + ) + difference({ dbA: dbA, dbB: dbB }).then((res) => { + console.timeEnd('rmduplicates') + writeData(res, resultDbFileName) + log('File written') + }) + } +} + +// --------------------------------------------------------------------------------- +// possible answers duplicate removing +// --------------------------------------------------------------------------------- + +function removePossibleAnswersDuplicates(path) { + let count = 0 + let currIndex = 1 + let delets = 0 + + iterateDir(path, () => { + count++ }) + + iterateDir(path, (currPath) => { + currIndex++ + if (currPath.includes('savedQuestions.json')) { + return + } + if (!utils.FileExists(currPath)) { + return + } + const currData = utils.ReadJSON(currPath) + currData.questions.forEach((q1) => { + iterateDir(path, (currPath2) => { + if (currPath === currPath2) { + return + } + if (currPath2.includes('savedQuestions.json')) { + return + } + if (!utils.FileExists(currPath)) { + return + } + const dataB = utils.ReadJSON(currPath2) + + dataB.questions.forEach((q2) => { + const percent = compareQuestionObj( + createQuestion(q1), + '', + createQuestion(q2), + '' + ) + if (percent.avg === 100) { + utils.deleteFile(currPath2) + count-- + delets++ + } + }) + }) + }) + + printProgressBar(currIndex, count) + }) + log(`Deleted ${C('green')}${delets}${C()} files`) } -async function rmDuplicates(db) { - return await difference({ dbA: db }) +function clearPossibleAnswers(path, db) { + let count = 0 + let currIndex = 1 + let delets = 0 + iterateDir(path, () => { + count++ + }) + + iterateDir(path, (currPath) => { + currIndex++ + if (currPath.includes('savedQuestions.json')) { + return + } + const { subj, questions } = utils.ReadJSON(currPath) + + questions.forEach((question) => { + const searchRes = search({ + qdb: db, + subjName: subj, + question: question, + searchTillMatchPercent: 80, + }) + if (searchRes.length > 0) { + utils.deleteFile(currPath) + delets++ + } + }) + printProgressBar(currIndex, count) + }) + log(`Deleted ${C('green')}${delets}${C()} files`) } -async function difference({ dbA, dbB }) { +// --------------------------------------------------------------------------------- +// difference +// --------------------------------------------------------------------------------- + +function rmDuplicates(db) { + return difference({ dbA: db }) +} + +function difference({ dbA, dbB }) { const doingDifference = !!dbB // Stuff only from A const resultDb = [] @@ -105,7 +220,7 @@ async function difference({ dbA, dbB }) { for (let j = 0; j < subj.Questions.length; j++) { const question = subj.Questions[j] - const searchRes = await search({ + const searchRes = search({ qdb: doingDifference ? dbB : resultDb, subjName: subj.Name, question: question, @@ -167,11 +282,14 @@ function hasRequiredPercent(result, minpercent) { // --------------------------------------------------------------------------------- function search({ qdb, subjName, question, searchInAllIfNoResult }) { - return new Promise((resolve) => { - resolve( - doSearch(qdb, subjName, question, null, minpercent, searchInAllIfNoResult) - ) - }) + return doSearch( + qdb, + subjName, + question, + null, + minpercent, + searchInAllIfNoResult + ) } function iterateSubjects(db, fn) { @@ -182,6 +300,26 @@ function iterateSubjects(db, fn) { }) } +// --------------------------------------------------------------------------------- +// possible answers tools +// --------------------------------------------------------------------------------- + +function iterateDir(path, action) { + if (!utils.FileExists(path)) { + return + } + + const stat = fs.lstatSync(path) + if (stat.isDirectory()) { + const content = fs.readdirSync(path) + content.forEach((currContent) => { + iterateDir(`${path}/${currContent}`, action) + }) + } else { + action(path) + } +} + // --------------------------------------------------------------------------------- // logging and tools // --------------------------------------------------------------------------------- diff --git a/src/utils/classes.ts b/src/utils/classes.ts index af6180a..685469e 100755 --- a/src/utils/classes.ts +++ b/src/utils/classes.ts @@ -302,7 +302,7 @@ function compareQuestionObj( q2: Question, q2subjName: string, data: QuestionData -) { +): any { assert(data !== undefined || data !== null) assert(q1) assert(typeof q1 === 'object') @@ -678,6 +678,7 @@ if (!isMainThread) { // ------------------------------------------------------------------------ export { + compareQuestionObj, minMatchAmmount, getSubjNameWithoutYear, createQuestion, diff --git a/submodules/qmining-data-editor b/submodules/qmining-data-editor index 33e8b3a..1446a57 160000 --- a/submodules/qmining-data-editor +++ b/submodules/qmining-data-editor @@ -1 +1 @@ -Subproject commit 33e8b3a49e7ddbf5c52721c51e655dc28b6ff877 +Subproject commit 1446a57c28072384b471fcad71e8c637c98ff207