Duplicate remover: managing possible questions

This commit is contained in:
mrfry 2021-03-29 18:19:02 +02:00
parent 934319f984
commit 43b8d939c1
3 changed files with 174 additions and 35 deletions

View file

@ -1,7 +1,13 @@
const utils = require('../../dist/utils/utils.js').default // eslint-disable-line
const logger = require('../../dist/utils/logger.js').default // eslint-disable-line
const { addQuestion, doSearch } = require('../../dist/utils/classes.js') // eslint-disable-line
const {
addQuestion,
doSearch,
compareQuestionObj,
createQuestion,
} = require('../../dist/utils/classes.js') // eslint-disable-line
const { loadData, writeData } = require('../../dist/utils/actions.js') // eslint-disable-line
const fs = require('fs') // eslint-disable-line
// Params [ 'publicDirs/qminingPublic/questionDbs/elektro.json' ]
// load: 2.767ms
@ -38,40 +44,149 @@ utils.WriteFile('', globalLog)
const params = process.argv.splice(2)
const fileA = params[0]
const fileB = params[1]
const pathA = params[0]
const pathB = params[1]
console.time('load')
const dbA = loadData(fileA)
const dbB = fileB ? loadData(fileB) : null
console.timeEnd('load')
const stat = fs.lstatSync(pathA)
if (stat.isDirectory()) {
if (pathB) {
log(
`Clearing possible questions from ${C(
'green'
)}${pathA}${C()} based on ${C('green')}${pathB}${C()} db`
)
const db = pathB ? loadData(pathB) : null
console.time('rmduplicates')
if (!dbB) {
log(`Removing duplicate questions from ${fileA}`)
rmDuplicates(dbA).then((res) => {
console.timeEnd('rmduplicates')
writeData(res, resultDbFileName)
log('File written')
})
clearPossibleAnswers(pathA, db)
} else {
removePossibleAnswersDuplicates(pathA)
}
} else {
log(
`Removing questions found in ${C('green')}${fileB}${C()} from ${C(
'green'
)}${fileA}${C()}`
)
difference({ dbA: dbA, dbB: dbB }).then((res) => {
console.timeEnd('rmduplicates')
writeData(res, resultDbFileName)
log('File written')
console.time('load')
const dbA = loadData(pathA)
const dbB = pathB ? loadData(pathB) : null
console.timeEnd('load')
console.time('rmduplicates')
if (!dbB) {
log(`Removing duplicate questions from ${C('green')}${pathA}${C()}`)
rmDuplicates(dbA).then((res) => {
console.timeEnd('rmduplicates')
writeData(res, resultDbFileName)
log('File written')
})
} else {
log(
`Removing questions found in ${C('green')}${pathB}${C()} from ${C(
'green'
)}${pathA}${C()}`
)
difference({ dbA: dbA, dbB: dbB }).then((res) => {
console.timeEnd('rmduplicates')
writeData(res, resultDbFileName)
log('File written')
})
}
}
// ---------------------------------------------------------------------------------
// possible answers duplicate removing
// ---------------------------------------------------------------------------------
function removePossibleAnswersDuplicates(path) {
let count = 0
let currIndex = 1
let delets = 0
iterateDir(path, () => {
count++
})
iterateDir(path, (currPath) => {
currIndex++
if (currPath.includes('savedQuestions.json')) {
return
}
if (!utils.FileExists(currPath)) {
return
}
const currData = utils.ReadJSON(currPath)
currData.questions.forEach((q1) => {
iterateDir(path, (currPath2) => {
if (currPath === currPath2) {
return
}
if (currPath2.includes('savedQuestions.json')) {
return
}
if (!utils.FileExists(currPath)) {
return
}
const dataB = utils.ReadJSON(currPath2)
dataB.questions.forEach((q2) => {
const percent = compareQuestionObj(
createQuestion(q1),
'',
createQuestion(q2),
''
)
if (percent.avg === 100) {
utils.deleteFile(currPath2)
count--
delets++
}
})
})
})
printProgressBar(currIndex, count)
})
log(`Deleted ${C('green')}${delets}${C()} files`)
}
async function rmDuplicates(db) {
return await difference({ dbA: db })
function clearPossibleAnswers(path, db) {
let count = 0
let currIndex = 1
let delets = 0
iterateDir(path, () => {
count++
})
iterateDir(path, (currPath) => {
currIndex++
if (currPath.includes('savedQuestions.json')) {
return
}
const { subj, questions } = utils.ReadJSON(currPath)
questions.forEach((question) => {
const searchRes = search({
qdb: db,
subjName: subj,
question: question,
searchTillMatchPercent: 80,
})
if (searchRes.length > 0) {
utils.deleteFile(currPath)
delets++
}
})
printProgressBar(currIndex, count)
})
log(`Deleted ${C('green')}${delets}${C()} files`)
}
async function difference({ dbA, dbB }) {
// ---------------------------------------------------------------------------------
// difference
// ---------------------------------------------------------------------------------
function rmDuplicates(db) {
return difference({ dbA: db })
}
function difference({ dbA, dbB }) {
const doingDifference = !!dbB
// Stuff only from A
const resultDb = []
@ -105,7 +220,7 @@ async function difference({ dbA, dbB }) {
for (let j = 0; j < subj.Questions.length; j++) {
const question = subj.Questions[j]
const searchRes = await search({
const searchRes = search({
qdb: doingDifference ? dbB : resultDb,
subjName: subj.Name,
question: question,
@ -167,11 +282,14 @@ function hasRequiredPercent(result, minpercent) {
// ---------------------------------------------------------------------------------
function search({ qdb, subjName, question, searchInAllIfNoResult }) {
return new Promise((resolve) => {
resolve(
doSearch(qdb, subjName, question, null, minpercent, searchInAllIfNoResult)
)
})
return doSearch(
qdb,
subjName,
question,
null,
minpercent,
searchInAllIfNoResult
)
}
function iterateSubjects(db, fn) {
@ -182,6 +300,26 @@ function iterateSubjects(db, fn) {
})
}
// ---------------------------------------------------------------------------------
// possible answers tools
// ---------------------------------------------------------------------------------
function iterateDir(path, action) {
if (!utils.FileExists(path)) {
return
}
const stat = fs.lstatSync(path)
if (stat.isDirectory()) {
const content = fs.readdirSync(path)
content.forEach((currContent) => {
iterateDir(`${path}/${currContent}`, action)
})
} else {
action(path)
}
}
// ---------------------------------------------------------------------------------
// logging and tools
// ---------------------------------------------------------------------------------

View file

@ -302,7 +302,7 @@ function compareQuestionObj(
q2: Question,
q2subjName: string,
data: QuestionData
) {
): any {
assert(data !== undefined || data !== null)
assert(q1)
assert(typeof q1 === 'object')
@ -678,6 +678,7 @@ if (!isMainThread) {
// ------------------------------------------------------------------------
export {
compareQuestionObj,
minMatchAmmount,
getSubjNameWithoutYear,
createQuestion,

@ -1 +1 @@
Subproject commit 33e8b3a49e7ddbf5c52721c51e655dc28b6ff877
Subproject commit 1446a57c28072384b471fcad71e8c637c98ff207