mrfrys-node-server/src/standaloneUtils/rmDuplicates.js
2022-12-10 15:34:54 +01:00

500 lines
14 KiB
JavaScript

const utils = require('../../dist/utils/utils.js').default // eslint-disable-line
const logger = require('../../dist/utils/logger.js').default // eslint-disable-line
const {
addQuestion,
doSearch,
compareQuestionObj,
createQuestion,
} = require('../../dist/utils/classes.js') // eslint-disable-line
const { loadData, writeData } = require('../../dist/utils/actions.js') // eslint-disable-line
const fs = require('fs') // eslint-disable-line
// Params [ 'publicDirs/qminingPublic/questionDbs/elektro.json' ]
// load: 2.767ms
// ==============================================================================================
// 1 / 1: Elektronika, 826 questions
// Result length: 0, original length: 826
// [=====================================================================] 826 / 826
// Removed 19 questions
// ==============================================================================================
// Result length: 807, original length: 826, removed 19 questions
// rmduplicates: 9.527s
// File written
// load: 15.91ms
// Removing duplicate questions from publicDirs/qminingPublic/questionDbs/elektro.json
// ==============================================================================================
// 1 / 1: Elektronika, 826 questions
// Result length: 0, original length: 826
// [=====================================================================] 826 / 826
// Removed 10 questions
// ==============================================================================================
// Result length: 816, original length: 826, removed 10 questions
// rmduplicates: 488.853ms
// File written
// Search results with a match of at least this percent count as duplicates
// (see hasRequiredPercent).
const minpercent = 95
// Separator written between the entries of the per-subject removal logs.
const line =
  '===================================================================='
// Per-subject logs are written into logPath; the global log lives in the
// same directory.
const logPath = './duplicateRemovingLog/'
const globalLog = `${logPath}log`
// Create the log directory and write an empty string to the global log
// before anything gets appended to it.
utils.CreatePath(logPath)
utils.WriteFile('', globalLog)
// ----------------------------------------------
// Progress counters; difference() updates them while it runs.
let currentMaxIndex = -1
let currentIndex = -1

// Answer every incoming process message with a snapshot of the counters.
process.on('message', () => {
  const progress = { currentMaxIndex, currentIndex }
  process.send(progress)
})
// ----------------------------------------------
// Command line handling and dispatch.
//
// Usage: node rmDuplicates.js [-s] <pathA> [pathB]
//   -s     do not print to the console (log() still appends to the log file)
//   pathA  a question DB file, or a directory of possible-answer files
//   pathB  optional DB; questions found in it are removed from pathA
//
// slice() is used instead of splice() so process.argv itself is not mutated.
const cliArgs = process.argv.slice(2)
// Read by log() and printProgressBar() to suppress console output.
const silenced = cliArgs.includes('-s')
// Everything that is not a "-flag" is treated as a positional path.
const params = cliArgs.filter((arg) => !arg.startsWith('-'))

console.log(params)
if (params.length === 0) {
  console.log('At least 1 parameter required (path to DB)')
  process.exit(1)
}

const [pathA, pathB] = params
// Fail with a readable message instead of an lstatSync ENOENT stack trace.
if (!fs.existsSync(pathA)) {
  console.log(`Path does not exist: ${pathA}`)
  process.exit(1)
}

// Wraps a value in the green colour codes used by all status messages.
const highlight = (text) => `${C('green')}${text}${C()}`

if (fs.lstatSync(pathA).isDirectory()) {
  // Directory mode: pathA holds possible-answer files.
  if (pathB) {
    log(
      `Clearing possible questions from ${highlight(
        pathA
      )} based on ${highlight(pathB)} db`
    )
    clearPossibleAnswers(pathA, loadData(pathB))
    log(
      `Cleared possible questions from ${highlight(
        pathA
      )} based on ${highlight(pathB)} db`
    )
  } else {
    log(`Removing possible question duplicates from ${highlight(pathA)}`)
    removePossibleAnswersDuplicates(pathA)
    log(`Removed possible question duplicates from ${highlight(pathA)}`)
  }
} else {
  // File mode: pathA is a question DB; the result is written next to the
  // current working directory as "<file name>.res".
  console.time('load')
  const dbA = loadData(pathA)
  const dbB = pathB ? loadData(pathB) : null
  console.timeEnd('load')

  const resultDbFileName = `${pathA.split('/').pop()}.res`

  console.time('rmduplicates')
  if (!dbB) {
    log(`Removing duplicate questions from ${highlight(pathA)}`)
    const res = rmDuplicates(dbA)
    console.timeEnd('rmduplicates')
    writeData(res, resultDbFileName)
    log('File written')
    log(`Removed duplicate questions from ${highlight(pathA)}`)
  } else {
    log(
      `Removing questions found in ${highlight(pathB)} from ${highlight(
        pathA
      )}`
    )
    const res = difference({ dbA: dbA, dbB: dbB })
    console.timeEnd('rmduplicates')
    writeData(res, resultDbFileName)
    log('File written')
    log(
      `Removed questions found in ${highlight(pathB)} from ${highlight(
        pathA
      )}`
    )
  }
}
// ---------------------------------------------------------------------------------
// possible answers duplicate removing
// ---------------------------------------------------------------------------------
// TODO: don't check every file against every other file; only compare files
// within the same directory (i.e. only compare questions of the same subject)
/**
 * Removes duplicate possible-answer files from every sub-directory of `path`.
 *
 * Inside each sub-directory every file is compared with every other file; a
 * file is deleted as soon as one of its questions matches a question of the
 * file it is compared against with an average of exactly 100. Files whose
 * path contains "savedQuestions.json" are never compared or deleted.
 * Afterwards empty sub-directories are removed and the savedQuestions.json
 * of every sub-directory is refreshed via updateSavedQuestionsFile().
 *
 * @param {string} path - directory whose sub-directories hold the files
 */
function removePossibleAnswersDuplicates(path) {
  const isSavedQuestionsFile = (filePath) =>
    filePath.includes('savedQuestions.json')

  // Only directories are processed: a plain file directly under `path`
  // would make the readdirSync calls below throw.
  const dirs = fs
    .readdirSync(path)
    .filter((entry) => fs.statSync(path + '/' + entry).isDirectory())

  // Approximate progress total: every file below `path` that is not a
  // savedQuestions.json index.
  let count = 0
  iterateDir(path, (filePath) => {
    if (!isSavedQuestionsFile(filePath)) {
      count++
    }
  })
  let currIndex = 0
  let delets = 0

  dirs.forEach((currDir) => {
    const dirPath = path + '/' + currDir
    const contents = fs.readdirSync(dirPath)

    contents.forEach((currFile) => {
      const currPath = dirPath + '/' + currFile
      // The file may already have been deleted as a duplicate of an
      // earlier file, hence the existence check.
      if (isSavedQuestionsFile(currPath) || !utils.FileExists(currPath)) {
        return
      }
      const dataA = utils.ReadJSON(currPath)
      currIndex++
      printProgressBar(currIndex, count)

      contents.forEach((currFile2) => {
        const currPath2 = dirPath + '/' + currFile2
        if (
          currPath2 === currPath ||
          isSavedQuestionsFile(currPath2) ||
          !utils.FileExists(currPath2)
        ) {
          return
        }
        const dataB = utils.ReadJSON(currPath2)

        // Stop at the first fully matching pair so the file is deleted and
        // counted exactly once, even when several questions match.
        const sharesQuestion = dataA.questions.some((q1) =>
          dataB.questions.some((q2) => {
            const percent = compareQuestionObj(
              createQuestion(q1),
              '',
              createQuestion(q2),
              ''
            )
            return percent.avg === 100
          })
        )
        if (sharesQuestion) {
          utils.deleteFile(currPath2)
          count--
          delets++
        }
      })
    })
  })

  log(`${C('green')}Deleting empty directories ...${C()}`)
  let deletedDirCount = 0
  dirs.forEach((dir, index) => {
    const dirPath = path + '/' + dir
    if (fs.readdirSync(dirPath).length === 0) {
      fs.rmdirSync(dirPath)
      deletedDirCount++
    }
    printProgressBar(index + 1, dirs.length)
  })

  log(`${C('green')}Updating savedQuestions.json ...${C()}`)
  dirs.forEach((dir, index) => {
    updateSavedQuestionsFile(path + '/' + dir)
    printProgressBar(index + 1, dirs.length)
  })

  log(
    `Deleted ${C('green')}${delets}${C()} files, and ${C(
      'green'
    )}${deletedDirCount}${C()} directories`
  )
}
/**
 * Deletes every possible-answer file below `path` that contains at least one
 * question for which search() returns a result from `db`.
 *
 * Files whose path contains "savedQuestions.json" are skipped.
 *
 * @param {string} path - directory that is walked recursively
 * @param {Array} db - question DB passed to search() as `qdb`
 */
function clearPossibleAnswers(path, db) {
  let count = 0
  let currIndex = 0
  let delets = 0

  // First pass only counts the files so the progress bar has a total.
  iterateDir(path, () => {
    count++
  })

  iterateDir(path, (currPath) => {
    currIndex++
    if (currPath.includes('savedQuestions.json')) {
      return
    }
    const { subj, questions } = utils.ReadJSON(currPath)

    // Stop at the first question found in the DB: the file is deleted (and
    // counted) once, and no further searches are run for it.
    const foundInDb = questions.some((question) => {
      const searchRes = search({
        qdb: db,
        subjName: subj,
        question: question,
        searchTillMatchPercent: 80,
      })
      return searchRes.length > 0
    })
    if (foundInDb) {
      utils.deleteFile(currPath)
      delets++
    }
    printProgressBar(currIndex, count)
  })

  log(`Deleted ${C('green')}${delets}${C()} files`)
}
/**
 * Drops entries from `<path>/savedQuestions.json` whose `fname` no longer
 * exists inside `path`. The file is only rewritten when at least one entry
 * was dropped; a missing savedQuestions.json is only logged.
 *
 * @param {string} path - directory containing savedQuestions.json
 */
function updateSavedQuestionsFile(path) {
  const filePath = `${path}/savedQuestions.json`

  if (utils.FileExists(filePath)) {
    const entries = utils.ReadJSON(filePath)
    const stillPresent = entries.filter((entry) =>
      utils.FileExists(`${path}/${entry.fname}`)
    )
    const somethingWasDropped = stillPresent.length !== entries.length
    if (somethingWasDropped) {
      utils.WriteFile(JSON.stringify(stillPresent), filePath)
    }
  } else {
    log(`${filePath} does not exists!`)
  }
}
// ---------------------------------------------------------------------------------
// difference
// ---------------------------------------------------------------------------------
/**
 * Removes duplicate questions from a single DB by running difference()
 * without a second DB.
 *
 * @param {Array} db - question DB to de-duplicate
 * @returns {Array} whatever difference() returns for `{ dbA: db }`
 */
function rmDuplicates(db) {
  const deduplicated = difference({ dbA: db })
  return deduplicated
}
/**
 * Builds a new DB from the questions of `dbA` that have no sufficiently
 * similar counterpart.
 *
 * - With `dbB`: a question is kept when search() finds no result in `dbB`
 *   with a match of at least `minpercent`.
 * - Without `dbB`: the search runs against the result built so far, so
 *   later duplicates inside `dbA` are dropped.
 *
 * Every dropped question is appended, together with its matches, to a
 * per-subject log file at `logPath + subj.Name`. The module-level
 * `currentMaxIndex` / `currentIndex` counters are updated while running.
 *
 * @param {{dbA: Array, dbB?: Array}} dbs - subjects with `Name` and
 *   `Questions`
 * @returns {Array} the resulting DB
 */
function difference({ dbA, dbB }) {
  const doingDifference = !!dbB
  // Questions that only exist in A end up here.
  const resultDb = []
  let removedTotal = 0
  let processedQuestions = 0

  let dbLength = 0
  iterateSubjects(dbA, () => {
    dbLength++
  })
  currentMaxIndex = dbLength

  for (let i = 0; i < dbA.length; i++) {
    const subj = dbA[i]
    const subjLogPath = logPath + subj.Name
    // Start every run with an empty per-subject log.
    utils.WriteFile('', subjLogPath)
    let removedCount = 0

    hr()
    log(
      `${C('blue')}${i + 1} / ${dbA.length}: ${C('green')}${
        subj.Name
      }, ${C('blue')}${subj.Questions.length}${C(
        'green'
      )} questions${C()}`
    )
    printProgressBar(i + 1, dbA.length)

    for (const question of subj.Questions) {
      const searchRes = search({
        qdb: doingDifference ? dbB : resultDb,
        subjName: subj.Name,
        question: question,
        searchInAllIfNoResult: doingDifference,
        searchTillMatchPercent: minpercent,
      })

      // Count the question before drawing, so the bar ends on
      // "total / total" instead of stopping one short.
      processedQuestions++
      currentIndex = processedQuestions
      printProgressBar(processedQuestions, dbLength)

      const res = hasRequiredPercent(searchRes, minpercent)
      if (res.length === 0) {
        // no result: adding to difference
        addQuestion(resultDb, subj.Name, question)
        continue
      }

      // has result, not adding to difference; every separator and JSON
      // dump is written on its own line
      const logEntry =
        [
          line,
          line,
          JSON.stringify(question, null, 2),
          line,
          JSON.stringify(res, null, 2),
        ].join('\n') + '\n'
      utils.AppendToFile(logEntry, subjLogPath)
      removedCount++
      removedTotal++
    }

    log(
      `${C('yellow')}Removed ${C('red')}${removedCount}${C(
        'yellow'
      )} questions${C()}`
    )
  }

  hr()
  let resultDbLength = 0
  iterateSubjects(resultDb, () => {
    resultDbLength++
  })
  log(
    `Result length: ${resultDbLength}, original length: ${dbLength}, removed ${removedTotal} questions`
  )
  return resultDb
}
/**
 * Keeps only the search results whose `match` reaches the given percent.
 *
 * @param {Array<{match: number}>} result - search results
 * @param {number} minpercent - inclusive lower bound for `match`
 * @returns {Array} new array holding the qualifying results, in order
 */
function hasRequiredPercent(result, minpercent) {
  return result.filter((candidate) => candidate.match >= minpercent)
}
// ---------------------------------------------------------------------------------
// db editing tools
// ---------------------------------------------------------------------------------
/**
 * Thin wrapper around doSearch() that always passes `null` as its fourth
 * argument.
 *
 * @param {object} options
 * @param {Array} options.qdb - DB to search in
 * @param {string} options.subjName - subject name of the question
 * @param {object} options.question - question to look for
 * @param {boolean} [options.searchInAllIfNoResult] - forwarded unchanged
 * @param {number} [options.searchTillMatchPercent=minpercent] - forwarded as
 *   the fifth doSearch() argument. Callers already pass this option; it used
 *   to be ignored in favour of the global `minpercent`.
 *   NOTE(review): presumably the percent at which doSearch stops searching;
 *   confirm against classes.js.
 * @returns {Array} the doSearch() result
 */
function search({
  qdb,
  subjName,
  question,
  searchInAllIfNoResult,
  searchTillMatchPercent = minpercent,
}) {
  return doSearch(
    qdb,
    subjName,
    question,
    null,
    searchTillMatchPercent,
    searchInAllIfNoResult
  )
}
/**
 * Calls `fn(subject, question)` for every question of every subject, in
 * order.
 *
 * @param {Array<{Questions: Array}>} db - subjects to walk
 * @param {function} fn - callback invoked once per question
 */
function iterateSubjects(db, fn) {
  for (const subject of db) {
    for (const question of subject.Questions) {
      fn(subject, question)
    }
  }
}
// ---------------------------------------------------------------------------------
// possible answers tools
// ---------------------------------------------------------------------------------
/**
 * Recursively walks `path` and calls `action(filePath)` for every entry that
 * is not a directory. Paths that do not exist are ignored.
 *
 * @param {string} path - file or directory to start from
 * @param {function} action - callback receiving each non-directory path
 */
function iterateDir(path, action) {
  if (!utils.FileExists(path)) {
    return
  }

  const isDirectory = fs.lstatSync(path).isDirectory()
  if (!isDirectory) {
    action(path)
    return
  }

  for (const entry of fs.readdirSync(path)) {
    iterateDir(`${path}/${entry}`, action)
  }
}
// ---------------------------------------------------------------------------------
// logging and tools
// ---------------------------------------------------------------------------------
/**
 * Logs a cyan horizontal rule made of "=" characters, as wide as the
 * terminal (empty when stdout reports no column count).
 */
function hr() {
  const rule = '='.repeat(process.stdout.columns || 0)
  log(`${C('cyan')}${rule}${C()}`)
}
/**
 * Appends `text` to the global log file and, unless the script is silenced,
 * prints it to the console. On a TTY the current line is cleared first so
 * the message replaces whatever is on that line (e.g. a progress bar).
 *
 * @param {string} text - message to log
 */
function log(text) {
  // The log file is written even in silent mode.
  utils.AppendToFile(text, globalLog)

  if (!silenced) {
    const out = process.stdout
    if (out.isTTY) {
      out.clearLine()
      out.cursorTo(0)
    }
    console.log(text)
  }
}
/**
 * Replaces the current terminal line with `text`. Does nothing when stdout
 * is not a TTY.
 *
 * @param {string} text - text to write
 * @param {boolean} returnToLineStart - when truthy a carriage return is
 *   written afterwards (so the next write overwrites the line), otherwise a
 *   newline
 */
function writeInSameLine(text, returnToLineStart) {
  const out = process.stdout
  if (out.isTTY) {
    out.clearLine()
    out.cursorTo(0)
    out.write(text)
    out.write(returnToLineStart ? '\r' : '\n')
  }
}
/**
 * Draws a progress bar of the form " [====    ] current / total" on the
 * current terminal line. Nothing is drawn when stdout is not a TTY, when the
 * script is silenced, or when the terminal is too narrow.
 *
 * The filled part is clamped to the bar width. A `total` that is zero or
 * negative draws an empty bar: the old scale factor `width / total` became
 * infinite there and the fill loop never terminated.
 *
 * @param {number} current - number of processed items
 * @param {number} total - number of items overall
 */
function printProgressBar(current, total) {
  if (!process.stdout.isTTY || silenced) {
    return
  }
  // 30 columns are reserved for the brackets and the "current / total" text.
  const width = process.stdout.columns - 30
  if (!(width > 0)) {
    return
  }

  const scale = total > 0 ? width / total : 0
  const filled = Math.min(width, Math.max(0, Math.floor(current * scale)))
  const bar = '='.repeat(filled) + ' '.repeat(width - filled)

  const numbers = `${current} / ${total}`
  // Stay on the same line until the last item, then finish it with a newline.
  writeInSameLine(
    `${C('magenta')} [${bar}]${C('green')} ${numbers}${C()}`,
    current !== total
  )
}
/**
 * Shorthand for `logger.C`.
 *
 * @param {string} [color] - colour name; callers pass nothing after coloured
 *   text (presumably to reset the colour — confirm in logger.js)
 * @returns {*} whatever `logger.C(color)` returns
 */
function C(color) {
  const colorCode = logger.C(color)
  return colorCode
}
// Terminate explicitly once all the work above has finished.
// NOTE(review): presumably needed because the 'message' listener registered
// near the top of this file would otherwise keep a forked process alive —
// confirm before removing.
process.exit()