mrfrys-node-server/src/standaloneUtils/rmDuplicates.js

239 lines
6.4 KiB
JavaScript

const utils = require('../../dist/utils/utils.js').default // eslint-disable-line
const logger = require('../../dist/utils/logger.js').default // eslint-disable-line
const { addQuestion, doSearch } = require('../../dist/utils/classes.js') // eslint-disable-line
const { loadData, writeData } = require('../../dist/utils/actions.js') // eslint-disable-line
// Params [ 'publicDirs/qminingPublic/questionDbs/elektro.json' ]
// load: 2.767ms
// ==============================================================================================
// 1 / 1: Elektronika, 826 questions
// Result length: 0, original length: 826
// [=====================================================================] 826 / 826
// Removed 19 questions
// ==============================================================================================
// Result length: 807, original length: 826, removed 19 questions
// rmduplicates: 9.527s
// File written
// load: 15.91ms
// Removing duplicate questions from publicDirs/qminingPublic/questionDbs/elektro.json
// ==============================================================================================
// 1 / 1: Elektronika, 826 questions
// Result length: 0, original length: 826
// [=====================================================================] 826 / 826
// Removed 10 questions
// ==============================================================================================
// Result length: 816, original length: 826, removed 10 questions
// rmduplicates: 488.853ms
// File written
// Minimum `match` value a search hit needs for a question to count as a duplicate.
const minpercent = 95
// File name handed to writeData() for the resulting database.
const resultDbFileName = 'res.json'
// Separator written between entries of the per-subject removal logs.
const line =
  '===================================================================='
// Directory of the per-subject removal logs, and the log that mirrors console output.
const logPath = './duplicateRemovingLog/'
const globalLog = './duplicateRemovingLog/log'

// Usage:
//   node rmDuplicates.js <dbFileA>            remove duplicates inside dbFileA
//   node rmDuplicates.js <dbFileA> <dbFileB>  remove from dbFileA what is found in dbFileB
// slice() yields the same arguments as splice() without mutating process.argv.
const params = process.argv.slice(2)
const fileA = params[0]
const fileB = params[1]

// Fail early with a readable message instead of passing `undefined` to loadData().
if (!fileA) {
  console.error('Usage: node rmDuplicates.js <dbFileA> [dbFileB]')
  process.exit(1)
}

utils.CreatePath(logPath)
// Writes an empty string to the global log (presumably resetting it for this run).
utils.WriteFile('', globalLog)

console.time('load')
const dbA = loadData(fileA)
const dbB = fileB ? loadData(fileB) : null
console.timeEnd('load')

console.time('rmduplicates')
// The announcement is logged before the job starts, because the job itself
// begins logging synchronously as soon as it is called.
let pendingResult
if (dbB) {
  log(
    `Removing questions found in ${C('green')}${fileB}${C()} from ${C(
      'green'
    )}${fileA}${C()}`
  )
  pendingResult = difference({ dbA, dbB })
} else {
  log(`Removing duplicate questions from ${fileA}`)
  pendingResult = rmDuplicates(dbA)
}

pendingResult
  .then((res) => {
    console.timeEnd('rmduplicates')
    writeData(res, resultDbFileName)
    log('File written')
  })
  .catch((err) => {
    // Report failures explicitly and signal them through the exit status
    // instead of leaving the rejection unhandled.
    console.error(err)
    process.exitCode = 1
  })
/**
 * Removes duplicate questions from a single database by running `difference`
 * with only `dbA` set.
 *
 * @param {Array} db - database to deduplicate; passed to `difference` as `dbA`
 * @returns {Promise<Array>} whatever `difference` resolves to
 */
async function rmDuplicates(db) {
  const deduplicated = await difference({ dbA: db })
  return deduplicated
}
/**
 * Builds a new database from the questions of `dbA` that have no sufficiently
 * similar counterpart.
 *
 * - With `dbB`: every question of `dbA` is searched for in `dbB`.
 * - Without `dbB`: every question is searched for in the result built so far,
 *   so later occurrences of an already kept question are dropped.
 *
 * A question is dropped when the search yields at least one hit whose `match`
 * is >= `minpercent`. Dropped questions are appended, together with their
 * matching hits, to a per-subject file under `logPath`.
 *
 * @param {Object} args
 * @param {Array} args.dbA - subjects (`{ Name, Questions }`) to filter
 * @param {Array} [args.dbB] - optional database whose questions are removed from `dbA`
 * @returns {Promise<Array>} the filtered database
 */
async function difference({ dbA, dbB }) {
  const doingDifference = !!dbB
  const countQuestions = (db) =>
    db.reduce((sum, subj) => sum + subj.Questions.length, 0)

  // Stuff only from A
  const resultDb = []
  const dbLength = countQuestions(dbA)
  let removedTotal = 0

  for (const [i, subj] of dbA.entries()) {
    // NOTE(review): subj.Name is used verbatim as a file name; a name containing
    // a path separator would point into a sub-directory - confirm this cannot happen.
    const subjLogPath = logPath + subj.Name
    utils.WriteFile('', subjLogPath)
    let removedCount = 0

    hr()
    log(
      `${C('blue')}${i + 1} / ${dbA.length}: ${C('green')}${subj.Name}, ${C(
        'blue'
      )}${subj.Questions.length}${C('green')} questions${C()}`
    )

    // Deliberately sequential: when deduplicating a single database every
    // search has to see the questions added by the previous iterations.
    for (const [j, question] of subj.Questions.entries()) {
      const searchRes = await search({
        qdb: doingDifference ? dbB : resultDb,
        subjName: subj.Name,
        question,
        searchInAllIfNoResult: doingDifference,
        searchTillMatchPercent: minpercent,
      })
      printProgressBar(j + 1, subj.Questions.length)

      const res = hasRequiredPercent(searchRes, minpercent)
      if (res.length === 0) {
        // no result: adding to difference
        addQuestion(resultDb, subj.Name, question)
        continue
      }

      // has result, not adding to difference; record what was dropped and why.
      // Every part ends with a newline so the matches start on their own line
      // instead of being glued to the separator.
      utils.AppendToFile(
        [
          line,
          line,
          JSON.stringify(question, null, 2),
          line,
          JSON.stringify(res, null, 2),
          '',
        ].join('\n'),
        subjLogPath
      )
      removedCount++
      removedTotal++
    }

    log(
      `${C('yellow')}Removed ${C('red')}${removedCount}${C(
        'yellow'
      )} questions${C()}`
    )
  }

  hr()
  log(
    `Result length: ${countQuestions(
      resultDb
    )}, original length: ${dbLength}, removed ${removedTotal} questions`
  )
  return resultDb
}
/**
 * Keeps only the search hits whose `match` reaches the given threshold.
 *
 * @param {Array<{match: number}>} result - search hits
 * @param {number} minpercent - inclusive lower bound for `match`
 * @returns {Array} new array with the qualifying hits, in their original order
 */
function hasRequiredPercent(result, minpercent) {
  const reachesThreshold = (hit) => hit.match >= minpercent
  return result.filter(reachesThreshold)
}
// ---------------------------------------------------------------------------------
// db editing tools
// ---------------------------------------------------------------------------------
/**
 * Promise-returning wrapper around `doSearch`.
 *
 * An exception thrown by `doSearch` surfaces as a rejection of the returned
 * promise. The fourth `doSearch` argument is always `null`.
 *
 * @param {Object} args
 * @param {Array} args.qdb - database to search in
 * @param {string} args.subjName - subject name passed to `doSearch`
 * @param {Object} args.question - question to look for
 * @param {boolean} args.searchInAllIfNoResult - forwarded to `doSearch` unchanged
 * @param {number} [args.searchTillMatchPercent=minpercent] - forwarded as the
 *   fifth `doSearch` argument (presumably the match percentage at which the
 *   search may stop - confirm against `doSearch`). Previously this option was
 *   accepted from callers but ignored in favour of the module constant.
 * @returns {Promise<Array>} the hits returned by `doSearch`
 */
async function search({
  qdb,
  subjName,
  question,
  searchInAllIfNoResult,
  searchTillMatchPercent = minpercent,
}) {
  return doSearch(
    qdb,
    subjName,
    question,
    null,
    searchTillMatchPercent,
    searchInAllIfNoResult
  )
}
/**
 * Calls `fn(subject, question)` once for every question of every subject,
 * in database order.
 *
 * @param {Array<{Questions: Array}>} db - subjects to walk
 * @param {Function} fn - callback receiving the subject and one of its questions
 */
function iterateSubjects(db, fn) {
  for (const subject of db) {
    for (const question of subject.Questions) {
      fn(subject, question)
    }
  }
}
// ---------------------------------------------------------------------------------
// logging and tools
// ---------------------------------------------------------------------------------
/**
 * Logs a cyan horizontal rule of `=` characters as wide as the terminal.
 *
 * `process.stdout.columns` is only defined when stdout is a TTY. When output
 * is piped or redirected the rule falls back to 80 columns, so the separator
 * is not an empty line.
 */
function hr() {
  const fallbackColumns = 80
  const width = process.stdout.columns || fallbackColumns
  log(`${C('cyan')}${'='.repeat(width)}${C()}`)
}
/**
 * Prints `text` to the console and appends the same text to the global log
 * file (`globalLog`).
 *
 * @param {string} text - message to output
 */
function log(text) {
  const logFile = globalLog
  console.log(text)
  utils.AppendToFile(text, logFile)
}
/**
 * Writes `text` to stdout followed by a carriage return or a newline.
 *
 * @param {string} text - text to write
 * @param {boolean} returnToLineStart - truthy: follow the text with '\r', so
 *   the next write starts at the beginning of the same line; falsy: follow it
 *   with '\n'
 */
function writeInSameLine(text, returnToLineStart) {
  const terminator = returnToLineStart ? '\r' : '\n'
  process.stdout.write(text)
  process.stdout.write(terminator)
}
/**
 * Draws a one-line progress bar such as ` [=====     ] 5 / 10`.
 *
 * While `current !== total` the line ends with a carriage return so the next
 * call redraws it in place; the final call ends it with a newline.
 *
 * Changes from the previous version:
 * - `process.stdout.columns` is undefined when stdout is not a TTY; the bar
 *   then assumes 80 columns instead of degenerating to `[]`.
 * - a `total` of 0 (or less) renders a full bar instead of looping forever.
 * - the bar is always exactly `width` cells wide; floating-point rounding can
 *   no longer shorten it by one cell.
 *
 * @param {number} current - number of processed items
 * @param {number} total - number of items overall
 */
function printProgressBar(current, total) {
  const fallbackColumns = 80
  // 25 columns are kept free for everything printed around the bar
  // (presumably brackets, padding and the "current / total" counter).
  const width = (process.stdout.columns || fallbackColumns) - 25
  if (width <= 0) {
    return
  }

  // Clamp to [0, 1] so a negative or oversized fill can never be requested.
  const fraction =
    total > 0 ? Math.min(Math.max(current / total, 0), 1) : 1
  // `|| 0` turns a NaN (non-numeric `current`) into an empty fill.
  const filled = Math.floor(fraction * width) || 0
  const bar = '='.repeat(filled) + ' '.repeat(width - filled)

  const numbers = `${current} / ${total}`
  writeInSameLine(
    `${C('magenta')} [${bar}]${C('green')} ${numbers}${C()}`,
    current !== total
  )
}
/**
 * Shorthand for `logger.C`, used inside template strings to colour console
 * output.
 *
 * @param {string} [color] - colour name passed to `logger.C`; callers omit it
 *   after coloured text (presumably to reset the colour - confirm in logger)
 * @returns {*} whatever `logger.C` returns
 */
function C(color) {
  const colorCode = logger.C(color)
  return colorCode
}